1pub mod serializer;
2
3use std::char;
4
// --- XML declaration: `<?xml version="1.x" encoding="..." standalone="yes|no"?>` ---
const XML_DECL_START: [char; 5] = ['<', '?', 'x', 'm', 'l'];
const XML_DECL_END: [char; 2] = ['?', '>'];
const XML_DECL_VERSION: [char; 7] = ['v', 'e', 'r', 's', 'i', 'o', 'n'];
const XML_DECL_VERSION_PREFIX: [char; 2] = ['1', '.'];
const XML_DECL_ENCODING: [char; 8] = ['e', 'n', 'c', 'o', 'd', 'i', 'n', 'g'];
const XML_DECL_STANDALONE: [char; 10] = ['s', 't', 'a', 'n', 'd', 'a', 'l', 'o', 'n', 'e'];
const YES: [char; 3] = ['y', 'e', 's'];
const NO: [char; 2] = ['n', 'o'];

// --- Comments, CDATA sections, doctype declarations, processing instructions ---
const COMMENT_START: [char; 4] = ['<', '!', '-', '-'];
const COMMENT_END: [char; 3] = ['-', '-', '>'];
const CDATA_START: [char; 9] = ['<', '!', '[', 'C', 'D', 'A', 'T', 'A', '['];
const CDATA_END: [char; 3] = [']', ']', '>'];
const DOCTYPE_START: [char; 9] = ['<', '!', 'D', 'O', 'C', 'T', 'Y', 'P', 'E'];
// A doctype declaration closes with the same character as a start tag.
const DOCTYPE_END: char = STAG_END;
const PI_START: [char; 2] = ['<', '?'];
// A processing instruction closes with the same sequence as the XML declaration.
const PI_END: [char; 2] = XML_DECL_END;

// --- Element tags ---
const STAG_START: char = '<';
const STAG_END: char = '>';
const ETAG_START: [char; 2] = ['<', '/'];
const ETAG_END: char = STAG_END;
const EMPTY_TAG_END: [char; 2] = ['/', '>'];

// --- Attributes and namespace declarations ---
const EQUALS: char = '=';
const SINGLE_QUOTE: char = '\'';
const DOUBLE_QUOTE: char = '"';
const XMLNS: [char; 5] = ['x', 'm', 'l', 'n', 's'];
const COLON: char = ':';

// --- Entity and character references ---
const ENTITY_REFERENCE_START: char = '&';
const DECIMAL_CHAR_REFERENCE_START: [char; 2] = ['&', '#'];
const HEXIDECIMAL_CHAR_REFERENCE_START: [char; 3] = ['&', '#', 'x'];
const REFERENCE_END: char = ';';

// --- Miscellaneous single characters ---
const HYPHEN: char = '-';
36
/// A lexical token produced by the tokenizer and appended to its `tokens` list.
///
/// Variants that carry a payload wrap one of the value types declared in this
/// file; the other variants only mark a structural position in the input.
#[derive(Debug, PartialEq)]
pub enum Token {
    /// The `<?xml` opener of an XML declaration.
    XMLDeclStart,
    /// The version named by the XML declaration.
    XMLVersion(XMLVersion),
    /// The encoding name given in the XML declaration.
    XMLEncoding(EncName),
    /// The standalone flag of the XML declaration: `true` for `yes`, `false` for `no`.
    XMLStandalone(bool),
    /// The `?>` that closes an XML declaration.
    XMLDeclEnd,
    /// Marks a doctype declaration; pushed immediately before its `DoctypeName`.
    DoctypeDeclStart,
    /// The name that follows `<!DOCTYPE`.
    DoctypeName(Name),
    /// The `>` that closes a doctype declaration.
    DoctypeDeclEnd,
    /// A comment delimited by `<!--` and `-->`.
    Comment(Comment),
    /// The `<?` opener of a processing instruction.
    PIStart,
    /// The target name of a processing instruction.
    PITarget(PITarget),
    /// The data part of a processing instruction.
    PIData(PIData),
    /// The `?>` that closes a processing instruction.
    PIEnd,
    /// The qualified name read after the `<` of a start or empty-element tag.
    ElementStart(QName),
    /// The `/>` that closes an empty-element tag.
    ElementEmptyEnd,
    /// The `>` that closes a start tag.
    ElementSTagEnd,
    /// The qualified name of an end tag (presumably `</name>`; its producer is not in this part of the file).
    ElementEnd(QName),
    /// Marks the beginning of an attribute.
    AttributeStart,
    /// The qualified name of an attribute.
    AttributeName(QName),
    /// Marks the beginning of an attribute value.
    AttributeValueStart,
    /// A piece of attribute value text.
    AttributeValue(AttributeValue),
    /// Marks the end of an attribute value.
    AttributeValueEnd,
    /// Marks the end of an attribute.
    AttributeEnd,
    /// Marks the beginning of a namespace declaration.
    NamespaceStart,
    /// Marks a default namespace declaration (presumably a bare `xmlns`).
    NamespaceDefault,
    /// The prefix bound by a namespace declaration.
    NamespacePrefix(NCName),
    /// The value of a namespace declaration.
    NamespaceValue(NamespaceValue),
    /// Marks the end of a namespace declaration.
    NamespaceEnd,
    /// A run of character data inside element content.
    Text(Text),
    /// A CDATA section delimited by `<![CDATA[` and `]]>`.
    CDATASection(CDATASection),
    /// An entity reference, carrying the referenced name.
    EntityRef(Name),
    /// A decimal character reference.
    DecCharRef(DecCharRef),
    /// A hexadecimal character reference.
    HexCharRef(HexCharRef),
}
75
/// The XML versions the tokenizer distinguishes between.
///
/// The version selects which character ranges `is_xml_char` and
/// `NamespaceValue::is_valid_char` accept.
#[derive(Debug, PartialEq)]
pub enum XMLVersion {
    /// XML 1.0.
    Version1_0,
    /// XML 1.1.
    Version1_1,
}
91
/// An encoding name as carried by `Token::XMLEncoding`.
#[derive(Debug, PartialEq)]
pub struct EncName {
    enc_name: String,
}
impl EncName {
    /// Wraps `enc_name` without checking its characters.
    fn new_unvalidated(enc_name: String) -> Self {
        Self { enc_name }
    }

    /// Returns `true` when `c` may begin an encoding name: an ASCII letter.
    fn is_valid_start_char(c: char) -> bool {
        c.is_ascii_alphabetic()
    }

    /// Returns `true` when `c` may appear after the first character of an
    /// encoding name: an ASCII letter or digit, `.`, `_` or `-`.
    fn is_valid_char(c: char) -> bool {
        c.is_ascii_alphanumeric() || c == '.' || c == '_' || c == '-'
    }

    /// Borrows the encoding name as a string slice.
    pub fn get_as_str(&self) -> &str {
        self.enc_name.as_str()
    }
}
138
/// An XML name that may contain colons, as carried by `Token::DoctypeName`
/// and `Token::EntityRef`.
#[derive(Debug, PartialEq)]
pub struct Name {
    name: String,
}
impl Name {
    /// Wraps `name` without checking its characters.
    fn new_unvalidated(name: String) -> Self {
        Self { name }
    }

    /// Returns `true` when `c` may begin a name.
    fn is_valid_start_char(c: char) -> bool {
        match c {
            ':' | '_' | 'a'..='z' | 'A'..='Z' => true,
            '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' => true,
            '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' => true,
            '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' => true,
            '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' => true,
            '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' => true,
            '\u{10000}'..='\u{EFFFF}' => true,
            _ => false,
        }
    }

    /// Returns `true` when `c` may appear after the first character of a name.
    ///
    /// That is every start character plus `-`, `.`, the ASCII digits, U+00B7,
    /// U+0300..=U+036F and U+203F..=U+2040.
    fn is_valid_char(c: char) -> bool {
        if Self::is_valid_start_char(c) {
            return true;
        }

        match c {
            '-' | '.' | '\u{B7}' | '0'..='9' => true,
            '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true,
            _ => false,
        }
    }

    /// Borrows the name as a string slice.
    pub fn get_as_str(&self) -> &str {
        self.name.as_str()
    }
}
213
214#[derive(Debug, PartialEq)]
220pub struct Comment {
221 comment: String,
222}
223impl Comment {
224 fn new_unvalidated(comment: String) -> Comment {
227 Comment { comment }
228 }
229
230 fn is_valid_char_minus_hyphen(c: char, version: &XMLVersion) -> bool {
236 if c == HYPHEN {
237 return false;
238 }
239
240 is_xml_char(c, version)
241 }
242
243 pub fn get_as_str(&self) -> &str {
245 &self.comment
246 }
247}
248
249fn is_xml_char(c: char, version: &XMLVersion) -> bool {
255 match version {
256 XMLVersion::Version1_0 => match c {
257 '\u{9}' | '\u{A}' | '\u{D}' => true,
258 '\u{20}'..='\u{D7FF}' => true,
259 '\u{E000}'..='\u{FFFD}' => true,
260 '\u{10000}'..='\u{10FFFF}' => true,
261 _ => false,
262 },
263 XMLVersion::Version1_1 => match c {
264 '\u{1}'..='\u{D7FF}' => true,
265 '\u{E000}'..='\u{FFFD}' => true,
266 '\u{10000}'..='\u{10FFFF}' => true,
267 _ => false,
268 },
269 }
270}
271
272#[derive(Debug, PartialEq)]
281pub struct PITarget {
282 target: String,
283}
284impl PITarget {
285 fn new_unvalidated(target: String) -> PITarget {
288 PITarget { target }
289 }
290
291 fn is_valid_start_char(c: char) -> bool {
294 Name::is_valid_start_char(c)
295 }
296
297 fn is_valid_char(c: char) -> bool {
300 Name::is_valid_char(c)
301 }
302
303 pub fn get_as_str(&self) -> &str {
305 &self.target
306 }
307}
308
309#[derive(Debug, PartialEq)]
315pub struct PIData {
316 data: String,
317}
318impl PIData {
319 pub fn new_unvalidated(data: String) -> PIData {
322 PIData { data }
323 }
324
325 fn is_valid_start_char(c: char) -> bool {
330 is_whitespace(c)
331 }
332
333 fn is_valid_char(c: char, version: &XMLVersion) -> bool {
338 is_xml_char(c, version)
339 }
340
341 pub fn get_as_str(&self) -> &str {
343 &self.data
344 }
345}
346
/// Returns `true` for the four characters treated as whitespace here:
/// space, tab, line feed and carriage return.
fn is_whitespace(c: char) -> bool {
    c == '\u{20}' || c == '\u{9}' || c == '\u{A}' || c == '\u{D}'
}
355
356#[derive(Debug, PartialEq)]
362pub struct QName {
363 prefix: Option<String>,
364 local_part: String,
365}
366impl QName {
367 fn new_unvalidated(prefix: Option<String>, local_part: String) -> QName {
371 QName { prefix, local_part }
372 }
373
374 fn is_valid_start_char(c: char) -> bool {
378 NCName::is_valid_start_char(c)
379 }
380
381 fn is_valid_char(c: char) -> bool {
385 NCName::is_valid_char(c)
386 }
387
388 pub fn get_prefix_as_str(&self) -> Option<&str> {
390 match &self.prefix {
391 Some(prefix) => Some(&prefix),
392 None => None,
393 }
394 }
395
396 pub fn get_local_part_as_str(&self) -> &str {
398 &self.local_part
399 }
400}
401
402#[derive(Debug, PartialEq)]
406pub struct AttributeValue {
407 value: String,
408}
409impl AttributeValue {
410 fn new_unvalidated(value: String) -> AttributeValue {
414 AttributeValue { value }
415 }
416
417 fn is_valid_inside_single_quotes_char(c: char) -> bool {
420 match c {
421 STAG_START | ENTITY_REFERENCE_START | SINGLE_QUOTE => false,
422 _ => true,
423 }
424 }
425
426 fn is_valid_inside_double_quotes_char(c: char) -> bool {
429 match c {
430 STAG_START | ENTITY_REFERENCE_START | DOUBLE_QUOTE => false,
431 _ => true,
432 }
433 }
434
435 pub fn get_as_str(&self) -> &str {
437 &self.value
438 }
439}
440
/// A name that contains no colon, as carried by `Token::NamespacePrefix`.
#[derive(Debug, PartialEq)]
pub struct NCName {
    nc_name: String,
}
impl NCName {
    /// Wraps `nc_name` without checking its characters.
    fn new_unvalidated(nc_name: String) -> Self {
        Self { nc_name }
    }

    /// Returns `true` when `c` may begin a non-colonized name.
    fn is_valid_start_char(c: char) -> bool {
        match c {
            '_' | 'a'..='z' | 'A'..='Z' => true,
            '\u{C0}'..='\u{D6}' | '\u{D8}'..='\u{F6}' | '\u{F8}'..='\u{2FF}' => true,
            '\u{370}'..='\u{37D}' | '\u{37F}'..='\u{1FFF}' => true,
            '\u{200C}'..='\u{200D}' | '\u{2070}'..='\u{218F}' => true,
            '\u{2C00}'..='\u{2FEF}' | '\u{3001}'..='\u{D7FF}' => true,
            '\u{F900}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' => true,
            '\u{10000}'..='\u{EFFFF}' => true,
            _ => false,
        }
    }

    /// Returns `true` when `c` may appear after the first character of a
    /// non-colonized name.
    ///
    /// That is every start character plus `-`, `.`, the ASCII digits, U+00B7,
    /// U+0300..=U+036F and U+203F..=U+2040.
    fn is_valid_char(c: char) -> bool {
        if Self::is_valid_start_char(c) {
            return true;
        }

        match c {
            '-' | '.' | '\u{B7}' | '0'..='9' => true,
            '\u{300}'..='\u{36F}' | '\u{203F}'..='\u{2040}' => true,
            _ => false,
        }
    }

    /// Borrows the name as a string slice.
    pub fn get_as_str(&self) -> &str {
        self.nc_name.as_str()
    }
}
513
514#[derive(Debug, PartialEq)]
518pub struct NamespaceValue {
519 value: String,
520}
521impl NamespaceValue {
522 fn new_unvalidated(value: String) -> NamespaceValue {
525 NamespaceValue { value }
526 }
527
528 fn is_valid_char(c: char, version: &XMLVersion) -> bool {
532 match version {
533 XMLVersion::Version1_0 => match c {
534 ':' | '/' | '?' | '#' | '[' | ']' | '@' => true,
536 '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => true,
538 'a'..='z' => true,
540 'A'..='Z' => true,
541 '0'..='9' => true,
542 '-' | '.' | '_' | '~' => true,
543 _ => false,
544 },
545 XMLVersion::Version1_1 => match c {
546 ':' | '/' | '?' | '#' | '[' | ']' | '@' => true,
548 '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | ';' | '=' => true,
550 'a'..='z' => true,
552 'A'..='Z' => true,
553 '0'..='9' => true,
554 '-' | '.' | '_' | '~' => true,
555 '\u{A0}'..='\u{D7FF}' => true,
557 '\u{F900}'..='\u{FDCF}' => true,
558 '\u{FDF0}'..='\u{FFEF}' => true,
559 '\u{10000}'..='\u{1FFFD}' => true,
560 '\u{20000}'..='\u{2FFFD}' => true,
561 '\u{30000}'..='\u{3FFFD}' => true,
562 '\u{40000}'..='\u{4FFFD}' => true,
563 '\u{50000}'..='\u{5FFFD}' => true,
564 '\u{60000}'..='\u{6FFFD}' => true,
565 '\u{70000}'..='\u{7FFFD}' => true,
566 '\u{80000}'..='\u{8FFFD}' => true,
567 '\u{90000}'..='\u{9FFFD}' => true,
568 '\u{A0000}'..='\u{AFFFD}' => true,
569 '\u{B0000}'..='\u{BFFFD}' => true,
570 '\u{C0000}'..='\u{CFFFD}' => true,
571 '\u{D0000}'..='\u{DFFFD}' => true,
572 '\u{E1000}'..='\u{EFFFD}' => true,
573 '\u{E000}'..='\u{F8FF}' => true,
575 '\u{F0000}'..='\u{FFFFD}' => true,
576 '\u{100000}'..='\u{10FFFD}' => true,
577 _ => false,
578 },
579 }
580 }
581
582 pub fn get_as_str(&self) -> &str {
584 &self.value
585 }
586}
587
588#[derive(Debug, PartialEq)]
594pub struct Text {
595 text: String,
596}
597impl Text {
598 fn new_unvalidated(text: String) -> Text {
602 Text { text }
603 }
604
605 fn is_valid_char(c: char) -> bool {
607 match c {
611 STAG_START | ENTITY_REFERENCE_START => false,
612 _ => true,
613 }
614 }
615
616 pub fn get_as_str(&self) -> &str {
618 &self.text
619 }
620
621 pub fn normalize_space(&self) -> String {
625 let collection: Vec<&str> = self.text.split_whitespace().collect();
626 collection.join(" ")
627 }
628
629 pub fn deduplicate_whitespace(&self) -> String {
632 let normalized_space = self.normalize_space();
633
634 if self.text.len() == 0 {
635 return normalized_space;
636 } else if normalized_space.len() == 0 {
637 return String::from(" ");
638 }
639
640 let whitespace_head: bool;
641 if is_whitespace(self.text.chars().next().unwrap()) {
642 whitespace_head = true;
643 } else {
644 whitespace_head = false;
645 }
646
647 let whitespace_tail: bool;
648 if is_whitespace(self.text.chars().last().unwrap()) {
649 whitespace_tail = true;
650 } else {
651 whitespace_tail = false;
652 }
653
654 if whitespace_head && whitespace_tail {
655 return format!(" {} ", normalized_space);
656 } else if whitespace_head {
657 return format!(" {}", normalized_space);
658 } else if whitespace_tail {
659 return format!("{} ", normalized_space);
660 } else {
661 return normalized_space;
662 }
663 }
664
665 pub fn normalize_space_deduplicate_head(&self) -> String {
668 let normalized_space = self.normalize_space();
669
670 if self.text.len() == 0 {
671 return normalized_space;
672 } else if normalized_space.len() == 0 {
673 return String::from(" ");
674 }
675
676 if is_whitespace(self.text.chars().next().unwrap()) {
677 return format!(" {}", normalized_space);
678 } else {
679 return normalized_space;
680 }
681 }
682
683 pub fn normalize_space_deduplicate_tail(&self) -> String {
686 let normalized_space = self.normalize_space();
687
688 if self.text.len() == 0 {
689 return normalized_space;
690 } else if normalized_space.len() == 0 {
691 return String::from(" ");
692 }
693
694 if is_whitespace(self.text.chars().last().unwrap()) {
695 return format!("{} ", normalized_space);
696 } else {
697 return normalized_space;
698 }
699 }
700}
// Unit tests for the whitespace helpers on `Text`.
#[cfg(test)]
mod text_tests {
    use super::*;

    // Leading and trailing whitespace is dropped entirely.
    #[test]
    fn normalize_space() {
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(text.normalize_space(), String::from("a b c"));
    }

    // One space survives at each end that had whitespace.
    #[test]
    fn deduplicate_whitespace() {
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(text.deduplicate_whitespace(), String::from(" a b c "));
    }

    // Only the leading space survives.
    #[test]
    fn normalize_space_deduplicate_head() {
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(
            text.normalize_space_deduplicate_head(),
            String::from(" a b c")
        );
    }

    // Only the trailing space survives.
    #[test]
    fn normalize_space_deduplicate_tail() {
        let text = Text::new_unvalidated(String::from(" a b c "));
        assert_eq!(
            text.normalize_space_deduplicate_tail(),
            String::from("a b c ")
        );
    }
}
735
736#[derive(Debug, PartialEq)]
742pub struct CDATASection {
743 data: String,
744}
745impl CDATASection {
746 fn new_unvalidated(data: String) -> CDATASection {
750 CDATASection { data }
751 }
752
753 fn is_valid_char(c: char, version: &XMLVersion) -> bool {
757 is_xml_char(c, version)
758 }
759
760 pub fn get_as_str(&self) -> &str {
762 &self.data
763 }
764}
765
766#[derive(Debug, PartialEq)]
772pub struct DecCharRef {
773 character: char,
774}
775impl DecCharRef {
776 fn new_unvalidated(character: char) -> DecCharRef {
780 DecCharRef { character }
781 }
782
783 pub fn new_from_string(
788 dec_code: String,
789 version: &XMLVersion,
790 ) -> Result<DecCharRef, ParseTokenError> {
791 match dec_code.parse::<u32>() {
792 Ok(u32_value) => match char::from_u32(u32_value) {
793 Some(c) => {
794 if is_xml_char(c, version) {
795 return Ok(DecCharRef::new_unvalidated(c));
796 }
797 }
798 None => {}
799 },
800 _ => {}
801 }
802
803 Err(ParseTokenError::new(ParseTokenErrorKind::DecCharRef))
804 }
805
806 pub fn get_as_char(&self) -> char {
808 self.character
809 }
810
811 pub fn get_as_u32(&self) -> u32 {
813 self.character as u32
814 }
815}
// Unit tests for `DecCharRef`.
#[cfg(test)]
mod dec_char_ref_tests {
    use super::*;

    // 169 is the decimal code point of the copyright sign.
    #[test]
    fn new_from_string_test() {
        match DecCharRef::new_from_string(String::from("169"), &XMLVersion::Version1_0) {
            Ok(dec_char_ref) => assert_eq!(dec_char_ref.get_as_char(), '©'),
            // Fail with a message instead of an uninformative `assert!(false)`.
            Err(_) => panic!("decimal character reference \"169\" should be accepted"),
        }
    }
}
831
832#[derive(Debug, PartialEq)]
838pub struct HexCharRef {
839 character: char,
840}
841impl HexCharRef {
842 fn new_unvalidated(character: char) -> HexCharRef {
846 HexCharRef { character }
847 }
848
849 pub fn new_from_string(
854 hex_code: String,
855 version: &XMLVersion,
856 ) -> Result<HexCharRef, ParseTokenError> {
857 match u32::from_str_radix(&hex_code, 16) {
858 Ok(u32_value) => match char::from_u32(u32_value) {
859 Some(c) => {
860 if is_xml_char(c, version) {
861 return Ok(HexCharRef::new_unvalidated(c));
862 }
863 }
864 None => {}
865 },
866 _ => {}
867 }
868
869 Err(ParseTokenError::new(ParseTokenErrorKind::HexCharRef))
870 }
871
872 pub fn get_as_char(&self) -> char {
874 self.character
875 }
876
877 pub fn get_as_u32(&self) -> u32 {
879 self.character as u32
880 }
881}
// Unit tests for `HexCharRef`.
#[cfg(test)]
mod hex_char_ref_tests {
    use super::*;

    // 0x1F61E is a code point outside the Basic Multilingual Plane.
    #[test]
    fn new_from_string_test() {
        match HexCharRef::new_from_string(String::from("1f61e"), &XMLVersion::Version1_0) {
            Ok(hex_char_ref) => assert_eq!(hex_char_ref.get_as_char(), '😞'),
            // Fail with a message instead of an uninformative `assert!(false)`.
            Err(_) => panic!("hexadecimal character reference \"1f61e\" should be accepted"),
        }
    }
}
897
/// State for tokenizing one XML document.
///
/// The `munch_*` methods advance `i` over `c` and append to `tokens`.
pub struct Tokenizer {
    // The input document as individual characters.
    c: Vec<char>,
    // Index into `c` of the next character to examine.
    i: usize,
    // Number of characters in `c`, captured at construction.
    length: usize,
    // Presumably the index where the current span began; its users
    // (`start_span` / `get_span`) are not in this part of the file.
    span_start: usize,
    /// Tokens produced so far, in input order.
    pub tokens: Vec<Token>,
    // XML version in effect; starts as 1.0 and is updated from the XML declaration.
    version: XMLVersion,
    // Presumably the messages recorded through `warning` — TODO confirm.
    warning_messages: Vec<String>,
    /// Presumably the messages recorded through `error` — TODO confirm.
    pub error_messages: Vec<String>,
    // Checked by the munchers to decide whether tokenizing has failed.
    error: bool,
}
909impl Tokenizer {
910 pub fn new(xml: String) -> Tokenizer {
911 let c: Vec<char> = xml.chars().collect();
912 let length = c.len();
913
914 Tokenizer {
915 c: c,
916 i: 0,
917 length: length,
918 span_start: 0,
919 tokens: Vec::new(),
920 version: XMLVersion::Version1_0,
921 warning_messages: Vec::new(),
922 error_messages: Vec::new(),
923 error: false,
924 }
925 }
926
    /// Tokenizes the whole input as an XML document.
    ///
    /// Returns the result of `munch_document`; the produced tokens are
    /// available in `tokens`.
    pub fn tokenize_document(&mut self) -> bool {
        self.munch_document()
    }
930
931 fn munch_document(&mut self) -> bool {
932 self.munch_prolog() && self.munch_element() && self.munch_misc_asterisk()
933 }
934
935 fn munch_prolog(&mut self) -> bool {
936 self.munch_xml_decl_eroteme()
937 && self.munch_misc_asterisk()
938 && self.munch_doctypedecl_misc_asterisk_eroteme()
939 }
940
    /// Munches an optional XML declaration.
    ///
    /// The declaration being absent is not a failure, so the result of
    /// `munch_xml_decl` is ignored; only the error flag decides the outcome.
    fn munch_xml_decl_eroteme(&mut self) -> bool {
        self.munch_xml_decl();

        !self.error
    }
946
947 fn munch_xml_decl(&mut self) -> bool {
948 if !self.munch_xml_decl_start() {
949 return false;
950 }
951
952 if !self.munch_version_info() {
953 self.error("An XML declaration must have a version attribute.");
954 return false;
955 }
956
957 self.munch_encoding_decl();
958 if self.error {
959 return false;
960 }
961
962 self.munch_sd_decl();
963 if self.error {
964 return false;
965 }
966
967 self.munch_s_eroteme();
968
969 if !self.munch_xml_decl_end() {
970 self.error("An XML declaration must end with '?>'.");
971 return false;
972 } else {
973 return true;
974 }
975 }
976
977 fn munch_xml_decl_start(&mut self) -> bool {
978 if self.munch_sequence(&XML_DECL_START) {
979 self.tokens.push(Token::XMLDeclStart);
980 return true;
981 }
982
983 false
984 }
985
986 fn munch_version_info(&mut self) -> bool {
987 self.munch_s();
988
989 if !self.munch_version() {
990 return false;
991 }
992
993 if !self.munch_eq() {
994 self.error("Expected an '=' after version attribute name in XML declaration.");
995 return false;
996 }
997
998 let double_quotes: bool;
999 if self.munch_double_quote() {
1000 double_quotes = true;
1001 } else if self.munch_single_quote() {
1002 double_quotes = false;
1003 } else {
1004 self.error("Expected a single or double quote.");
1005 return false;
1006 }
1007
1008 if !self.munch_version_num() {
1009 self.error("Expected legal version number in XML declaration.");
1010 }
1011
1012 if double_quotes && !self.munch_double_quote() {
1013 self.error("Expected closing double quote following version value in XML declaration.");
1014 return false;
1015 }
1016
1017 if !double_quotes && !self.munch_single_quote() {
1018 self.error("Expected closing single quote following version value in XML declaration.");
1019 return false;
1020 }
1021
1022 true
1023 }
1024
1025 fn munch_s(&mut self) -> bool {
1026 if self.i < self.length && is_whitespace(self.c[self.i]) {
1027 self.i += 1;
1028
1029 return self.munch_s_eroteme();
1030 }
1031
1032 false
1033 }
1034
1035 fn munch_s_eroteme(&mut self) -> bool {
1036 while self.i < self.length && is_whitespace(self.c[self.i]) {
1037 self.i += 1;
1038 }
1039
1040 true
1041 }
1042
    /// Munches the keyword `version`.
    fn munch_version(&mut self) -> bool {
        self.munch_sequence(&XML_DECL_VERSION)
    }
1046
1047 fn munch_eq(&mut self) -> bool {
1048 self.munch_s_eroteme() && self.munch_character(EQUALS) && self.munch_s_eroteme()
1049 }
1050
    /// Munches a single quote character (`'`).
    fn munch_single_quote(&mut self) -> bool {
        self.munch_character(SINGLE_QUOTE)
    }
1054
    /// Munches a double quote character (`"`).
    fn munch_double_quote(&mut self) -> bool {
        self.munch_character(DOUBLE_QUOTE)
    }
1058
1059 fn munch_version_num(&mut self) -> bool {
1060 if self.munch_sequence(&XML_DECL_VERSION_PREFIX) {
1061 self.start_span();
1062
1063 if self.munch_digits() {
1064 let span = self.get_span(0);
1065
1066 match span.as_ref() {
1067 "0" => {
1068 self.version = XMLVersion::Version1_0;
1069 self.tokens.push(Token::XMLVersion(XMLVersion::Version1_0));
1070 return true;
1071 }
1072 "1" => {
1073 self.version = XMLVersion::Version1_1;
1074 self.tokens.push(Token::XMLVersion(XMLVersion::Version1_1));
1075 return true;
1076 }
1077 _ => {
1078 self.version = XMLVersion::Version1_1;
1079 self.warning(&format!(
1080 "Unknown XML version 1.{} tokenizing as if it were version '1.1'.",
1081 span
1082 ));
1083 }
1084 }
1085 }
1086 }
1087
1088 false
1089 }
1090
1091 fn munch_digits(&mut self) -> bool {
1092 if self.i < self.length && is_digit(self.c[self.i]) {
1093 self.i += 1;
1094
1095 while self.i < self.length && is_digit(self.c[self.i]) {
1096 self.i += 1;
1097 }
1098
1099 return true;
1100 }
1101
1102 false
1103 }
1104
1105 fn munch_encoding_decl(&mut self) -> bool {
1106 self.munch_s();
1107
1108 if !self.munch_encoding() {
1109 return false;
1110 }
1111
1112 if !self.munch_eq() {
1113 self.error("Expected an '=' after encoding attribute name in XML declaration.");
1114 return false;
1115 }
1116
1117 let double_quotes: bool;
1118 if self.munch_double_quote() {
1119 double_quotes = true;
1120 } else if self.munch_single_quote() {
1121 double_quotes = false;
1122 } else {
1123 self.error("Expected a single or double quote.");
1124 return false;
1125 }
1126
1127 if !self.munch_enc_name() {
1128 self.error("Expected legal encoding value in XML declaration.");
1129 return false;
1130 }
1131
1132 if double_quotes && !self.munch_double_quote() {
1133 self.error(
1134 "Expected closing double quote following encoding value in XML declaration.",
1135 );
1136 return false;
1137 }
1138
1139 if !double_quotes && !self.munch_single_quote() {
1140 self.error(
1141 "Expected closing single quote following encoding value in XML declaration.",
1142 );
1143 return false;
1144 }
1145
1146 true
1147 }
1148
    /// Munches the keyword `encoding`.
    fn munch_encoding(&mut self) -> bool {
        self.munch_sequence(&XML_DECL_ENCODING)
    }
1152
1153 fn munch_enc_name(&mut self) -> bool {
1154 if self.i < self.length && EncName::is_valid_start_char(self.c[self.i]) {
1155 self.start_span();
1156 self.i += 1;
1157
1158 while self.i < self.length && EncName::is_valid_char(self.c[self.i]) {
1159 self.i += 1;
1160 }
1161
1162 let span = self.get_span(0);
1163
1164 self.tokens
1165 .push(Token::XMLEncoding(EncName::new_unvalidated(span)));
1166
1167 return true;
1168 }
1169
1170 false
1171 }
1172
1173 fn munch_sd_decl(&mut self) -> bool {
1174 self.munch_s();
1175
1176 if !self.munch_standalone() {
1177 return false;
1178 }
1179
1180 if !self.munch_eq() {
1181 self.error("Expected an '=' after standalone attribute name in XML declaration.");
1182 return false;
1183 }
1184
1185 let double_quotes: bool;
1186 if self.munch_double_quote() {
1187 double_quotes = true;
1188 } else if self.munch_single_quote() {
1189 double_quotes = false;
1190 } else {
1191 self.error("Expected a single or double quote.");
1192 return false;
1193 }
1194
1195 if !self.munch_yes_no() {
1196 self.error("Expected yes or no for standalone value in XML declaration.");
1197 return false;
1198 }
1199
1200 if double_quotes && !self.munch_double_quote() {
1201 self.error(
1202 "Expected closing double quote following standalone value in XML declaration.",
1203 );
1204 return false;
1205 }
1206
1207 if !double_quotes && !self.munch_single_quote() {
1208 self.error(
1209 "Expected closing single quote following standalone value in XML declaration.",
1210 );
1211 return false;
1212 }
1213
1214 true
1215 }
1216
    /// Munches the keyword `standalone`.
    fn munch_standalone(&mut self) -> bool {
        self.munch_sequence(&XML_DECL_STANDALONE)
    }
1220
1221 fn munch_yes_no(&mut self) -> bool {
1222 if self.munch_sequence(&YES) {
1223 self.tokens.push(Token::XMLStandalone(true));
1224 } else if self.munch_sequence(&NO) {
1225 self.tokens.push(Token::XMLStandalone(false));
1226 } else {
1227 return false;
1228 }
1229
1230 true
1231 }
1232
1233 fn munch_xml_decl_end(&mut self) -> bool {
1234 if self.munch_sequence(&XML_DECL_END) {
1235 self.tokens.push(Token::XMLDeclEnd);
1236 return true;
1237 }
1238
1239 false
1240 }
1241
1242 fn munch_misc_asterisk(&mut self) -> bool {
1243 while self.i < self.length {
1244 if !self.munch_misc() {
1245 break;
1246 }
1247 }
1248
1249 !self.error
1250 }
1251
1252 fn munch_misc(&mut self) -> bool {
1253 if self.munch_comment() {
1254 return !self.error;
1255 }
1256
1257 if self.munch_pi() {
1258 return !self.error;
1259 }
1260
1261 if self.munch_s() {
1262 return !self.error;
1263 }
1264
1265 false
1266 }
1267
1268 fn munch_comment(&mut self) -> bool {
1269 if !self.munch_sequence(&COMMENT_START) {
1270 return false;
1271 }
1272 self.start_span();
1273
1274 while self.i < self.length {
1275 if Comment::is_valid_char_minus_hyphen(self.c[self.i], &self.version) {
1276 self.i += 1;
1277 continue;
1278 } else if self.munch_sequence(&COMMENT_END) {
1279 let span = self.get_span(COMMENT_END.len());
1280 self.tokens
1281 .push(Token::Comment(Comment::new_unvalidated(span)));
1282 return true;
1283 } else if self.i + 1 < self.length
1284 && self.c[self.i] == HYPHEN
1285 && Comment::is_valid_char_minus_hyphen(self.c[self.i + 1], &self.version)
1286 {
1287 self.i += 1;
1288 continue;
1289 } else {
1290 self.error("Illegal character in comment.");
1291 return false;
1292 }
1293 }
1294
1295 self.error("Comment must end with the character sequence '-->'.");
1296 return false;
1297 }
1298
1299 fn munch_pi(&mut self) -> bool {
1300 if self.munch_sequence(&PI_START) {
1301 self.tokens.push(Token::PIStart);
1302 } else {
1303 return false;
1304 }
1305
1306 if !self.munch_pi_target() {
1307 return false;
1308 }
1309
1310 return self.munch_pi_data();
1311 }
1312
1313 fn munch_pi_target(&mut self) -> bool {
1314 if PITarget::is_valid_start_char(self.c[self.i]) {
1315 self.start_span();
1316 self.i += 1;
1317
1318 while self.i < self.length && PITarget::is_valid_char(self.c[self.i]) {
1319 self.i += 1;
1320 }
1321
1322 let pi_target = self.get_span(0);
1323 if pi_target.to_lowercase() == "xml" {
1324 self.error("Illegal processing instruction target. The string 'xml' and all case variations are reserved.");
1325 return false;
1326 }
1327 self.tokens
1328 .push(Token::PITarget(PITarget::new_unvalidated(pi_target)));
1329 return true;
1330 }
1331
1332 self.error("Illegal start character for processing instruction target.");
1333 false
1334 }
1335
1336 fn munch_pi_data(&mut self) -> bool {
1337 if self.munch_sequence(&PI_END) {
1338 self.tokens.push(Token::PIEnd);
1339 return true;
1340 } else if PIData::is_valid_start_char(self.c[self.i]) {
1341 self.start_span();
1342 self.i += 1;
1343
1344 while self.i < self.length {
1345 if self.munch_sequence(&PI_END) {
1346 let pi_data = self.get_span(PI_END.len());
1347 self.tokens
1348 .push(Token::PIData(PIData::new_unvalidated(pi_data)));
1349 self.tokens.push(Token::PIEnd);
1350 return true;
1351 } else if PIData::is_valid_char(self.c[self.i], &self.version) {
1352 self.i += 1;
1353 } else {
1354 self.error("Illegal character in processing instruction data.");
1355 return false;
1356 }
1357 }
1358
1359 self.error("Processing instruction must end with the '?>' character sequence.");
1360 return false;
1361 } else {
1362 self.error("Illegal start character in processing instruction data.");
1363 false
1364 }
1365 }
1366
1367 fn munch_doctypedecl_misc_asterisk_eroteme(&mut self) -> bool {
1368 if !self.munch_doctypedecl() {
1369 return !self.error;
1370 }
1371
1372 return self.munch_misc_asterisk();
1373 }
1374
1375 fn munch_doctypedecl(&mut self) -> bool {
1376 if !self.munch_sequence(&DOCTYPE_START) {
1377 return false;
1378 }
1379
1380 if !self.munch_s() {
1381 self.error("Doctypedecl must have an S following Doctype start.");
1382 return false;
1383 }
1384
1385 if !self.munch_doctype_name() {
1386 self.error("Doctypedecl must have a doctype name.");
1387 return false;
1388 }
1389
1390 if self.munch_character(DOCTYPE_END) {
1394 self.tokens.push(Token::DoctypeDeclEnd);
1395 return true;
1396 } else {
1397 self.error("Doctypedecl must end in a '>' character.");
1398 return false;
1399 }
1400 }
1401
1402 fn munch_doctype_name(&mut self) -> bool {
1403 if Name::is_valid_start_char(self.c[self.i]) {
1404 self.start_span();
1405 self.i += 1;
1406
1407 while self.i < self.length {
1408 if Name::is_valid_char(self.c[self.i]) {
1409 self.i += 1;
1410 } else {
1411 let span = self.get_span(0);
1412 self.tokens.push(Token::DoctypeDeclStart);
1413 self.tokens.push(Token::DoctypeName(Name { name: span }));
1414 return true;
1415 }
1416 }
1417
1418 self.error("Doctype declaration must end with a '>' character.");
1419 return false;
1420 }
1421
1422 self.error("Illegal first character of doctype name.");
1423 false
1424 }
1425
1426 fn munch_element(&mut self) -> bool {
1427 if !self.munch_character(STAG_START) {
1428 return false;
1429 }
1430
1431 if !self.munch_element_name() {
1432 self.error("Expected an element name.");
1433 return false;
1434 }
1435
1436 self.munch_s_attibute_asterisk();
1437 if self.error {
1438 return false;
1439 }
1440
1441 if self.munch_empty_element_end() {
1442 return !self.error;
1443 }
1444
1445 if self.munch_character(STAG_END) {
1446 self.tokens.push(Token::ElementSTagEnd);
1447 } else {
1448 self.error("Expected end of STag.");
1449 return false;
1450 }
1451
1452 return self.munch_content();
1453 }
1454
1455 fn munch_element_name(&mut self) -> bool {
1456 if QName::is_valid_start_char(self.c[self.i]) {
1457 self.start_span();
1458 self.i += 1;
1459 let mut prefix_defined = false;
1460 let mut prefix = String::new();
1461
1462 while self.i < self.length {
1463 if QName::is_valid_char(self.c[self.i]) {
1464 self.i += 1;
1465 } else if self.c[self.i] == COLON {
1466 prefix = self.get_span(0);
1467 prefix_defined = true;
1468 self.i += 1;
1469 self.start_span();
1470 } else {
1471 let local_part = self.get_span(0);
1472 if prefix_defined {
1473 self.tokens.push(Token::ElementStart(QName::new_unvalidated(
1474 Some(prefix),
1475 local_part,
1476 )));
1477 } else {
1478 self.tokens.push(Token::ElementStart(QName::new_unvalidated(
1479 None, local_part,
1480 )));
1481 }
1482 return true;
1483 }
1484 }
1485
1486 self.error("Premature end of input in element tag.");
1487 return false;
1488 }
1489
1490 self.error("Expected name start character.");
1491 false
1492 }
1493
1494 fn munch_s_attibute_asterisk(&mut self) -> bool {
1495 while self.i < self.length {
1496 if self.munch_s() && self.munch_attribute() {
1497 continue;
1498 } else {
1499 break;
1500 }
1501 }
1502
1503 !self.error
1504 }
1505
1506 fn munch_empty_element_end(&mut self) -> bool {
1507 if self.munch_sequence(&EMPTY_TAG_END) {
1508 self.tokens.push(Token::ElementEmptyEnd);
1509 return true;
1510 }
1511
1512 false
1513 }
1514
1515 fn munch_content(&mut self) -> bool {
1516 while self.i < self.length {
1517 if self.munch_etag() {
1518 return true;
1519 } else if self.munch_cd_sect() {
1520 continue;
1521 } else if self.munch_comment() {
1522 continue;
1523 } else if self.munch_pi() {
1524 continue;
1525 } else if self.munch_element() {
1526 continue;
1527 } else if self.munch_reference() {
1528 continue;
1529 } else if self.munch_char_data() {
1530 continue;
1531 } else {
1532 return false;
1533 }
1534 }
1535
1536 !self.error
1537 }
1538
1539 fn munch_char_data(&mut self) -> bool {
1540 if self.i < self.length && Text::is_valid_char(self.c[self.i]) {
1541 self.start_span();
1542 self.i += 1;
1543
1544 while self.i < self.length {
1545 if self.munch_sequence(&CDATA_END) {
1546 self.error("Illegal CDATA END in character data.");
1547 return false;
1548 } else if Text::is_valid_char(self.c[self.i]) {
1549 self.i += 1;
1550 } else {
1551 break;
1552 }
1553 }
1554
1555 let span = self.get_span(0);
1556 self.tokens.push(Token::Text(Text::new_unvalidated(span)));
1557 return true;
1558 }
1559
1560 false
1561 }
1562
1563 fn munch_cd_sect(&mut self) -> bool {
1564 if self.munch_sequence(&CDATA_START) {
1565 self.start_span();
1566
1567 while self.i < self.length {
1568 if self.munch_sequence(&CDATA_END) {
1569 let span = self.get_span(CDATA_END.len());
1570 self.tokens
1571 .push(Token::CDATASection(CDATASection::new_unvalidated(span)));
1572 return true;
1573 } else if CDATASection::is_valid_char(self.c[self.i], &self.version) {
1574 self.i += 1;
1575 } else {
1576 self.error("Illegal character in CDATA Section.");
1577 return false;
1578 }
1579 }
1580
1581 self.error("CDATA Section must end with a ']]>' character sequence.");
1582 return false;
1583 }
1584
1585 false
1586 }
1587
1588 fn munch_reference(&mut self) -> bool {
1589 if self.munch_char_ref() {
1590 return !self.error;
1591 } else if self.munch_entity_ref() {
1592 return !self.error;
1593 } else {
1594 return false;
1595 }
1596 }
1597
1598 fn munch_char_ref(&mut self) -> bool {
1599 if self.munch_hexidecimal_char_ref() {
1600 return !self.error;
1601 } else if self.munch_decimal_char_ref() {
1602 return !self.error;
1603 } else {
1604 return false;
1605 }
1606 }
1607
1608 fn munch_hexidecimal_char_ref(&mut self) -> bool {
1609 if !self.munch_sequence(&HEXIDECIMAL_CHAR_REFERENCE_START) {
1610 return false;
1611 }
1612 self.start_span();
1613
1614 while self.i < self.length {
1615 if is_hexidecimal_digit(self.c[self.i]) {
1616 self.i += 1;
1617 } else if self.munch_character(REFERENCE_END) {
1618 match HexCharRef::new_from_string(self.get_span(1), &self.version) {
1619 Ok(hex_char_ref) => {
1620 self.tokens.push(Token::HexCharRef(hex_char_ref));
1621 return true;
1622 }
1623 Err(parse_char_ref_err) => {
1624 self.error(
1625 "Failed to parse hexidecimal character reference to a character.",
1626 );
1627 self.error(parse_char_ref_err.message());
1628 return false;
1629 }
1630 }
1631 } else {
1632 self.error("Illegal character in hexidecimal character reference.");
1633 return false;
1634 }
1635 }
1636
1637 self.error("Expected a ';' character to terminate the hexidecimal character reference.");
1638 false
1639 }
1640
1641 fn munch_decimal_char_ref(&mut self) -> bool {
1642 if !self.munch_sequence(&DECIMAL_CHAR_REFERENCE_START) {
1643 return false;
1644 }
1645 self.start_span();
1646
1647 while self.i < self.length {
1648 if is_digit(self.c[self.i]) {
1649 self.i += 1;
1650 } else if self.munch_character(REFERENCE_END) {
1651 match DecCharRef::new_from_string(self.get_span(1), &self.version) {
1652 Ok(dec_char_ref) => {
1653 self.tokens.push(Token::DecCharRef(dec_char_ref));
1654 return true;
1655 }
1656 Err(parse_char_ref_err) => {
1657 self.error(
1658 "Failed to parse decidecimal character reference to a character.",
1659 );
1660 self.error(parse_char_ref_err.message());
1661 return false;
1662 }
1663 }
1664 } else {
1665 self.error("Illegal character in decidecimal character reference.");
1666 return false;
1667 }
1668 }
1669
1670 self.error("Expected a ';' character to terminate the decidecimal character reference.");
1671 false
1672 }
1673
1674 fn munch_entity_ref(&mut self) -> bool {
1675 if self.munch_character(ENTITY_REFERENCE_START) {
1676 self.start_span();
1677
1678 if Name::is_valid_start_char(self.c[self.i]) {
1679 self.i += 1;
1680 } else {
1681 self.error("Expected a legal name start character in entity reference.");
1682 return false;
1683 }
1684
1685 while self.i < self.length {
1686 if Name::is_valid_char(self.c[self.i]) {
1687 self.i += 1;
1688 } else if self.munch_character(REFERENCE_END) {
1689 let span = self.get_span(1);
1690 self.tokens
1691 .push(Token::EntityRef(Name::new_unvalidated(span)));
1692 return true;
1693 } else {
1694 self.error("Illegal character in entity reference name.");
1695 return false;
1696 }
1697 }
1698
1699 self.error("Entity reference must end with a ';' character.");
1700 return false;
1701 }
1702
1703 false
1704 }
1705
1706 fn munch_etag(&mut self) -> bool {
1707 if !self.munch_sequence(&ETAG_START) {
1708 return false;
1709 }
1710
1711 if !QName::is_valid_start_char(self.c[self.i]) {
1712 self.error("Expected a qname start character after ETag start.");
1713 return false;
1714 }
1715
1716 let mut prefix_defined = false;
1717 let mut prefix = String::new();
1718 self.start_span();
1719 self.i += 1;
1720
1721 while self.i < self.length {
1722 if QName::is_valid_char(self.c[self.i]) {
1723 self.i += 1;
1724 } else if self.c[self.i] == COLON {
1725 prefix = self.get_span(0);
1726 prefix_defined = true;
1727 self.i += 1;
1728 self.start_span();
1729 } else if self.munch_character(ETAG_END) {
1730 let local_part = self.get_span(1);
1731 if prefix_defined {
1732 self.tokens.push(Token::ElementEnd(QName::new_unvalidated(
1733 Some(prefix),
1734 local_part,
1735 )));
1736 } else {
1737 self.tokens
1738 .push(Token::ElementEnd(QName::new_unvalidated(None, local_part)));
1739 }
1740 return true;
1741 } else {
1742 self.error("Illegal character in ETag name.");
1743 return false;
1744 }
1745 }
1746
1747 self.error("ETag must finish with a closing '>' character.");
1748 false
1749 }
1750
1751 fn munch_attribute(&mut self) -> bool {
1752 if self.munch_namespace() {
1753 return true;
1754 }
1755
1756 if !self.munch_attribute_name() {
1757 return false;
1758 }
1759
1760 if !self.munch_eq() {
1761 self.error("Expected an '=' character after an attribute name.");
1762 return false;
1763 }
1764
1765 if !self.munch_attribute_value() {
1766 self.error("Expected an attribute value.");
1767 return false;
1768 }
1769
1770 true
1771 }
1772
1773 fn munch_namespace(&mut self) -> bool {
1774 if self.munch_sequence(&XMLNS) {
1775 self.tokens.push(Token::NamespaceStart);
1776 } else {
1777 return false;
1778 }
1779
1780 if self.munch_character(COLON) {
1781 if !self.munch_namespace_prefix() {
1782 self.error("Expected a namespace prefix after the character sequence 'xmlns:'.");
1783 return false;
1784 }
1785 } else {
1786 self.tokens.push(Token::NamespaceDefault);
1787 }
1788
1789 if !self.munch_eq() {
1790 self.error("Expected an '=' character after a namespace attribute name.");
1791 return false;
1792 }
1793
1794 self.munch_namespace_value()
1795 }
1796
1797 fn munch_namespace_prefix(&mut self) -> bool {
1798 if NCName::is_valid_start_char(self.c[self.i]) {
1799 self.start_span();
1800 self.i += 1;
1801
1802 while self.i < self.length {
1803 if NCName::is_valid_char(self.c[self.i]) {
1804 self.i += 1;
1805 } else {
1806 break;
1807 }
1808 }
1809
1810 let prefix = self.get_span(0);
1811 self.tokens
1812 .push(Token::NamespacePrefix(NCName::new_unvalidated(prefix)));
1813 return true;
1814 }
1815
1816 false
1817 }
1818
1819 fn munch_namespace_value(&mut self) -> bool {
1820 let double_quotes: bool;
1821 if self.munch_double_quote() {
1822 double_quotes = true;
1823 } else if self.munch_single_quote() {
1824 double_quotes = false;
1825 } else {
1826 self.error("Expected a single or double quote.");
1827 return false;
1828 }
1829
1830 self.start_span();
1831
1832 while self.i < self.length {
1833 if double_quotes && self.munch_character(DOUBLE_QUOTE)
1834 || !double_quotes && self.munch_character(SINGLE_QUOTE)
1835 {
1836 let span = self.get_span(1);
1837 self.tokens
1838 .push(Token::NamespaceValue(NamespaceValue::new_unvalidated(span)));
1839 self.tokens.push(Token::NamespaceEnd);
1840 return true;
1841 } else if NamespaceValue::is_valid_char(self.c[self.i], &self.version) {
1842 self.i += 1;
1843 } else {
1844 self.error("Illegal character in namespace value.");
1845 return false;
1846 }
1847 }
1848
1849 if double_quotes {
1850 self.error("Expected closing double quote following namespace value.");
1851 } else {
1852 self.error("Expected closing single quote following namespace value.");
1853 }
1854
1855 false
1856 }
1857
1858 fn munch_attribute_name(&mut self) -> bool {
1859 if QName::is_valid_start_char(self.c[self.i]) {
1860 let mut prefix_defined = false;
1861 let mut prefix = String::new();
1862 self.start_span();
1863 self.i += 1;
1864
1865 while self.i < self.length {
1866 if QName::is_valid_char(self.c[self.i]) {
1867 self.i += 1;
1868 } else if self.c[self.i] == COLON {
1869 prefix = self.get_span(0);
1870 prefix_defined = true;
1871 self.i += 1;
1872 self.start_span();
1873 } else {
1874 break;
1875 }
1876 }
1877
1878 let local_part = self.get_span(0);
1879 if prefix_defined {
1880 self.tokens.push(Token::AttributeStart);
1881 self.tokens
1882 .push(Token::AttributeName(QName::new_unvalidated(
1883 Some(prefix),
1884 local_part,
1885 )));
1886 } else {
1887 self.tokens.push(Token::AttributeStart);
1888 self.tokens
1889 .push(Token::AttributeName(QName::new_unvalidated(
1890 None, local_part,
1891 )));
1892 }
1893 return true;
1894 }
1895
1896 false
1897 }
1898
1899 fn munch_attribute_value(&mut self) -> bool {
1900 let double_quotes: bool;
1901 if self.munch_double_quote() {
1902 double_quotes = true;
1903 } else if self.munch_single_quote() {
1904 double_quotes = false;
1905 } else {
1906 self.error("Expected a single or double quote.");
1907 return false;
1908 }
1909
1910 self.tokens.push(Token::AttributeValueStart);
1911 self.start_span();
1912
1913 while self.i < self.length {
1914 if double_quotes && AttributeValue::is_valid_inside_double_quotes_char(self.c[self.i])
1915 || !double_quotes
1916 && AttributeValue::is_valid_inside_single_quotes_char(self.c[self.i])
1917 {
1918 self.i += 1;
1919 } else if self.c[self.i] == ENTITY_REFERENCE_START {
1920 let span = self.get_span(0);
1921 self.tokens
1922 .push(Token::AttributeValue(AttributeValue::new_unvalidated(span)));
1923 self.munch_reference();
1924 self.start_span();
1925 } else if (double_quotes && self.munch_double_quote())
1926 || (!double_quotes && self.munch_single_quote())
1927 {
1928 let span = self.get_span(1);
1929 self.tokens
1930 .push(Token::AttributeValue(AttributeValue::new_unvalidated(span)));
1931 self.tokens.push(Token::AttributeEnd);
1932 return true;
1933 } else {
1934 self.error("Illegal character in attribute value.");
1935 return false;
1936 }
1937 }
1938
1939 if double_quotes {
1940 self.error("Expected closing double quote following attribute value.");
1941 } else {
1942 self.error("Expected closing single quote following attriubte value.");
1943 }
1944
1945 false
1946 }
1947
1948 fn munch_character(&mut self, character: char) -> bool {
1949 if self.i < self.length && self.c[self.i] == character {
1950 self.i += 1;
1951 return true;
1952 }
1953
1954 false
1955 }
1956
1957 fn munch_sequence(&mut self, sequence: &[char]) -> bool {
1958 let sequence_length = sequence.len();
1959 let sequence_end = self.i + sequence_length;
1960
1961 if sequence_end > self.length {
1962 return false;
1963 }
1964
1965 if &self.c[self.i..sequence_end] == sequence {
1966 self.i += sequence_length;
1967 return true;
1968 }
1969
1970 false
1971 }
1972
1973 fn start_span(&mut self) {
1974 self.span_start = self.i;
1975 }
1976
1977 fn get_span(&mut self, span_end_offset: usize) -> String {
1978 let span_end = self.i - span_end_offset;
1979
1980 self.c[self.span_start..span_end].iter().collect()
1981 }
1982
1983 fn warning(&mut self, msg: &str) {
1984 if self.i < self.length {
1985 self.warning_messages
1986 .push(format!("c[{}]={} {}", self.i, self.c[self.i], msg));
1987 } else {
1988 self.warning_messages
1989 .push(format!("Out of bounds: {}", msg));
1990 }
1991 }
1992
1993 fn error(&mut self, msg: &str) {
1994 if self.i < self.length {
1995 self.error_messages
1996 .push(format!("c[{}]={} {}", self.i, self.c[self.i], msg));
1997 } else {
1998 self.error_messages.push(format!("Out of bounds: {}", msg));
1999 }
2000 }
2001}
2002#[cfg(test)]
2003mod tokenizer_tests {
2004 use super::*;
2005
// A lone empty-element tag is accepted by `tokenize_document`.
#[test]
fn tokenize_document_hit() {
    let expected = vec![
        Token::ElementStart(QName::new_unvalidated(None, "a".to_string())),
        Token::ElementEmptyEnd,
    ];

    let mut tokenizer = Tokenizer::new("<a/>".to_string());
    let accepted = tokenizer.tokenize_document();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2019
// A lone empty-element tag is accepted by `munch_document`.
#[test]
fn munch_document_hit() {
    let expected = vec![
        Token::ElementStart(QName::new_unvalidated(None, "a".to_string())),
        Token::ElementEmptyEnd,
    ];

    let mut tokenizer = Tokenizer::new("<a/>".to_string());
    let accepted = tokenizer.munch_document();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2033
// An empty input is an acceptable prolog and produces no tokens.
#[test]
fn munch_prolog_hit() {
    let mut tokenizer = Tokenizer::new(String::new());
    let accepted = tokenizer.munch_prolog();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2043
// The optional XML declaration may be absent: empty input, no tokens.
#[test]
fn munch_xml_decl_eroteme_hit() {
    let mut tokenizer = Tokenizer::new(String::new());
    let accepted = tokenizer.munch_xml_decl_eroteme();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2051
// XML declarations with every combination of the optional encoding and
// standalone parts, plus a variant with whitespace around `=` and double
// quotes. Every case asserts the return value, the tokens and the error flag.
#[test]
fn munch_xml_decl_hit() {
    let cases = vec![
        (
            "<?xml version='1.0'?>",
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLDeclEnd,
            ],
        ),
        (
            "<?xml version='1.0' encoding='utf-8'?>",
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLEncoding(EncName::new_unvalidated(String::from("utf-8"))),
                Token::XMLDeclEnd,
            ],
        ),
        (
            "<?xml version='1.0' standalone='yes'?>",
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLStandalone(true),
                Token::XMLDeclEnd,
            ],
        ),
        (
            "<?xml version='1.0' encoding='utf-8' standalone='no'?>",
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLEncoding(EncName::new_unvalidated(String::from("utf-8"))),
                Token::XMLStandalone(false),
                Token::XMLDeclEnd,
            ],
        ),
        (
            "<?xml version = \"1.0\" encoding = \"utf-8\" standalone = \"no\" ?>",
            vec![
                Token::XMLDeclStart,
                Token::XMLVersion(XMLVersion::Version1_0),
                Token::XMLEncoding(EncName::new_unvalidated(String::from("utf-8"))),
                Token::XMLStandalone(false),
                Token::XMLDeclEnd,
            ],
        ),
    ];

    for (input, expected) in cases {
        let mut tok = Tokenizer::new(String::from(input));
        // The standalone-only case previously called `munch_xml_decl` without
        // checking its result; every case now asserts it.
        assert!(tok.munch_xml_decl(), "declaration rejected: {:?}", input);
        assert_eq!(tok.tokens, expected, "unexpected tokens for {:?}", input);
        assert!(!tok.error, "error flag set for {:?}", input);
    }
}
2123
// `<?xml` yields a single XMLDeclStart token.
#[test]
fn munch_xml_decl_start_hit() {
    let expected = vec![Token::XMLDeclStart];

    let mut tokenizer = Tokenizer::new("<?xml".to_string());
    let accepted = tokenizer.munch_xml_decl_start();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2131
// Version info with a leading space yields the XML 1.0 version token.
#[test]
fn munch_version_info_hit() {
    let expected = vec![Token::XMLVersion(XMLVersion::Version1_0)];

    let mut tokenizer = Tokenizer::new(" version='1.0'".to_string());
    let accepted = tokenizer.munch_version_info();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2139
// A single space is accepted as whitespace and produces no tokens.
#[test]
fn munch_s_hit() {
    let mut tokenizer = Tokenizer::new(" ".to_string());
    let accepted = tokenizer.munch_s();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2147
// Optional whitespace: both empty input and a mixed whitespace run are
// accepted without producing tokens.
#[test]
fn munch_s_eroteme_hit() {
    for input in vec!["", " \n \t \r "] {
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_s_eroteme());
        assert!(tokenizer.tokens.is_empty());
        assert!(!tokenizer.error);
    }
}
2160
// The `version` keyword is consumed without producing tokens.
#[test]
fn munch_version_hit() {
    let mut tokenizer = Tokenizer::new("version".to_string());
    let accepted = tokenizer.munch_version();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2168
// `=` is accepted bare and surrounded by spaces; neither produces tokens.
#[test]
fn munch_eq_hit() {
    for input in vec!["=", " = "] {
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_eq());
        assert!(tokenizer.tokens.is_empty());
        assert!(!tokenizer.error);
    }
}
2181
// A single quote is consumed without producing tokens.
#[test]
fn munch_single_quote_hit() {
    let mut tokenizer = Tokenizer::new("'".to_string());
    let accepted = tokenizer.munch_single_quote();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2189
// A double quote is consumed without producing tokens.
#[test]
fn munch_double_quote_hit() {
    let mut tokenizer = Tokenizer::new("\"".to_string());
    let accepted = tokenizer.munch_double_quote();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2197
// Version numbers 1.0 and 1.1 map to their respective version tokens.
#[test]
fn munch_version_num_hit() {
    let cases = vec![
        ("1.0", XMLVersion::Version1_0),
        ("1.1", XMLVersion::Version1_1),
    ];

    for (input, version) in cases {
        let expected = vec![Token::XMLVersion(version)];
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_version_num());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }
}
2210
// A run of all ten decimal digits is consumed without producing tokens.
#[test]
fn munch_digits_hit() {
    let mut tokenizer = Tokenizer::new("1234567890".to_string());
    let accepted = tokenizer.munch_digits();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2218
// Encoding declarations in compact single-quoted form and in spaced
// double-quoted form both yield the encoding name as written.
#[test]
fn munch_encoding_decl_hit() {
    let cases = vec![
        (" encoding='utf-8'", "utf-8"),
        (" encoding = \"UTF-8\" ", "UTF-8"),
    ];

    for (input, encoding_name) in cases {
        let expected = vec![Token::XMLEncoding(EncName::new_unvalidated(
            encoding_name.to_string(),
        ))];
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_encoding_decl());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }
}
2241
// The `encoding` keyword is consumed without producing tokens.
#[test]
fn munch_encoding_hit() {
    let mut tokenizer = Tokenizer::new("encoding".to_string());
    let accepted = tokenizer.munch_encoding();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2249
// An encoding name containing digits and a hyphen yields an encoding token.
#[test]
fn munch_enc_name_hit() {
    let expected = vec![Token::XMLEncoding(EncName::new_unvalidated(
        "iso8859-1".to_string(),
    ))];

    let mut tokenizer = Tokenizer::new("iso8859-1".to_string());
    let accepted = tokenizer.munch_enc_name();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2262
// A standalone declaration with value `yes` yields XMLStandalone(true).
#[test]
fn munch_sd_decl_hit() {
    let expected = vec![Token::XMLStandalone(true)];

    let mut tokenizer = Tokenizer::new(" standalone='yes'".to_string());
    let accepted = tokenizer.munch_sd_decl();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2270
// The `standalone` keyword is consumed without producing tokens.
#[test]
fn munch_standalone_hit() {
    let mut tokenizer = Tokenizer::new("standalone".to_string());
    let accepted = tokenizer.munch_standalone();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2278
// `yes` and `no` map to XMLStandalone(true) and XMLStandalone(false).
#[test]
fn munch_yes_no_hit() {
    let cases = vec![("yes", true), ("no", false)];

    for (input, standalone) in cases {
        let expected = vec![Token::XMLStandalone(standalone)];
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_yes_no());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }
}
2291
// `?>` yields a single XMLDeclEnd token.
#[test]
fn munch_xml_decl_end_hit() {
    let expected = vec![Token::XMLDeclEnd];

    let mut tokenizer = Tokenizer::new("?>".to_string());
    let accepted = tokenizer.munch_xml_decl_end();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2299
// Zero Misc items: empty input is accepted and produces no tokens.
#[test]
fn munch_misc_asterisk_hit() {
    let mut tokenizer = Tokenizer::new(String::new());
    let accepted = tokenizer.munch_misc_asterisk();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2309
// A single space is accepted as a Misc item and produces no tokens.
#[test]
fn munch_misc_hit() {
    let mut tokenizer = Tokenizer::new(" ".to_string());
    let accepted = tokenizer.munch_misc();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2319
// Comments: empty, plain text, and text containing single hyphens. The
// comment token carries the text between the delimiters unchanged.
#[test]
fn munch_comment_hit() {
    let cases = vec![
        ("<!---->", ""),
        ("<!--My comment text-->", "My comment text"),
        ("<!-- My - comment - text -->", " My - comment - text "),
    ];

    for (input, comment_text) in cases {
        let expected = vec![Token::Comment(Comment::new_unvalidated(
            comment_text.to_string(),
        ))];
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_comment());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }
}
2350
// A processing instruction yields start, target, data and end tokens; the
// data keeps its leading space.
#[test]
fn munch_pi_hit() {
    let expected = vec![
        Token::PIStart,
        Token::PITarget(PITarget {
            target: "mypi".to_string(),
        }),
        Token::PIData(PIData::new_unvalidated(" my pi data".to_string())),
        Token::PIEnd,
    ];

    let mut tokenizer = Tokenizer::new("<?mypi my pi data?>".to_string());
    let accepted = tokenizer.munch_pi();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2368
// A bare name is tokenized as a processing-instruction target.
#[test]
fn munch_pi_target() {
    let expected = vec![Token::PITarget(PITarget {
        target: "mypi".to_string(),
    })];

    let mut tokenizer = Tokenizer::new("mypi".to_string());
    let accepted = tokenizer.munch_pi_target();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2381
// Processing-instruction data: empty data yields only the end token, while
// data starting with whitespace yields a data token followed by the end token.
#[test]
fn munch_pi_data() {
    {
        let expected = vec![Token::PIEnd];
        let mut tokenizer = Tokenizer::new("?>".to_string());

        assert!(tokenizer.munch_pi_data());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }

    {
        let expected = vec![
            Token::PIData(PIData {
                data: " Valid PI data is empty or starts with S.".to_string(),
            }),
            Token::PIEnd,
        ];
        let mut tokenizer =
            Tokenizer::new(" Valid PI data is empty or starts with S.?>".to_string());

        assert!(tokenizer.munch_pi_data());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }
}
2402
// Optional doctype declaration followed by Misc items: absent, present on
// its own, and present followed by a comment.
#[test]
fn munch_doctypedecl_misc_asterisk_eroteme_hit() {
    // Tokens produced by `<!DOCTYPE html>` in every non-empty case below.
    let doctype_html = || {
        vec![
            Token::DoctypeDeclStart,
            Token::DoctypeName(Name::new_unvalidated("html".to_string())),
            Token::DoctypeDeclEnd,
        ]
    };

    let mut with_comment = doctype_html();
    with_comment.push(Token::Comment(Comment::new_unvalidated(
        "Comment".to_string(),
    )));

    let cases = vec![
        ("", Vec::new()),
        ("<!DOCTYPE html>", doctype_html()),
        // NOTE(review): identical to the previous case as written; possibly
        // meant to differ (e.g. in whitespace).
        ("<!DOCTYPE html>", doctype_html()),
        ("<!DOCTYPE html> <!--Comment-->", with_comment),
    ];

    for (input, expected) in cases {
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_doctypedecl_misc_asterisk_eroteme());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }
}
2449
// A minimal doctype declaration yields start, name and end tokens.
#[test]
fn munch_doctypedecl_hit() {
    let expected = vec![
        Token::DoctypeDeclStart,
        Token::DoctypeName(Name::new_unvalidated("html".to_string())),
        Token::DoctypeDeclEnd,
    ];

    let mut tokenizer = Tokenizer::new("<!DOCTYPE html>".to_string());
    let accepted = tokenizer.munch_doctypedecl();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2466
// An empty-element tag yields an element start and an empty-end token.
#[test]
fn munch_element_hit() {
    let expected = vec![
        Token::ElementStart(QName::new_unvalidated(None, "a".to_string())),
        Token::ElementEmptyEnd,
    ];

    let mut tokenizer = Tokenizer::new("<a/>".to_string());
    let accepted = tokenizer.munch_element();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2482
// Element names with and without a prefix, terminated by `/>`, by a space,
// or by `>`. Only the name is tokenized; the terminator is not part of it.
#[test]
fn munch_element_name_hit() {
    let cases = vec![
        ("emptyelementname/>", None, "emptyelementname"),
        ("emptyelementname />", None, "emptyelementname"),
        ("validname>", None, "validname"),
        ("prefix:emptyelementname/>", Some("prefix"), "emptyelementname"),
        ("prefix:emptyelementname />", Some("prefix"), "emptyelementname"),
        ("prefix:validname>", Some("prefix"), "validname"),
    ];

    for (input, prefix, local_part) in cases {
        let expected = vec![Token::ElementStart(QName::new_unvalidated(
            prefix.map(String::from),
            local_part.to_string(),
        ))];
        let mut tokenizer = Tokenizer::new(input.to_string());

        assert!(tokenizer.munch_element_name());
        assert_eq!(tokenizer.tokens, expected);
        assert!(!tokenizer.error);
    }
}
2551
// Zero attributes: empty input is accepted and produces no tokens.
#[test]
fn munch_s_attibute_asterisk_hit() {
    let mut tokenizer = Tokenizer::new(String::new());
    let accepted = tokenizer.munch_s_attibute_asterisk();

    assert!(accepted);
    assert!(tokenizer.tokens.is_empty());
    assert!(!tokenizer.error);
}
2561
// `/>` yields a single ElementEmptyEnd token.
#[test]
fn munch_empty_element_end_hit() {
    let expected = vec![Token::ElementEmptyEnd];

    let mut tokenizer = Tokenizer::new("/>".to_string());
    let accepted = tokenizer.munch_empty_element_end();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2569
// Element content: empty input, character data, child elements, an entity
// reference, a CDATA section, a processing instruction and a comment.
#[test]
fn munch_content_hit() {
    let cases = vec![
        ("", vec![]),
        (
            "character data",
            vec![Token::Text(Text::new_unvalidated(String::from(
                "character data",
            )))],
        ),
        // NOTE(review): identical to the previous case as written; possibly
        // meant to differ (e.g. in whitespace).
        (
            "character data",
            vec![Token::Text(Text::new_unvalidated(String::from(
                "character data",
            )))],
        ),
        (
            "<a/>",
            vec![
                Token::ElementStart(QName::new_unvalidated(None, String::from("a"))),
                Token::ElementEmptyEnd,
            ],
        ),
        (
            "<a></a>",
            vec![
                Token::ElementStart(QName::new_unvalidated(None, String::from("a"))),
                Token::ElementSTagEnd,
                Token::ElementEnd(QName::new_unvalidated(None, String::from("a"))),
            ],
        ),
        // The input must be the reference text itself: a bare `&` carries no
        // entity name and cannot produce the `amp` entity reference.
        (
            "&amp;",
            vec![Token::EntityRef(Name::new_unvalidated(String::from("amp")))],
        ),
        (
            "<![CDATA[mycdata]]>",
            vec![Token::CDATASection(CDATASection::new_unvalidated(
                String::from("mycdata"),
            ))],
        ),
        (
            "<?pi my pi data?>",
            vec![
                Token::PIStart,
                Token::PITarget(PITarget::new_unvalidated(String::from("pi"))),
                Token::PIData(PIData::new_unvalidated(String::from(" my pi data"))),
                Token::PIEnd,
            ],
        ),
        (
            "<!--my comment-->",
            vec![Token::Comment(Comment::new_unvalidated(String::from(
                "my comment",
            )))],
        ),
    ];

    for (input, expected) in cases {
        let mut tok = Tokenizer::new(String::from(input));
        assert!(tok.munch_content(), "content rejected: {:?}", input);
        assert_eq!(tok.tokens, expected, "unexpected tokens for {:?}", input);
        assert!(!tok.error, "error flag set for {:?}", input);
    }
}
2663
// Character data stops before `<`; the text token excludes it.
#[test]
fn munch_char_data_hit() {
    let expected = vec![Token::Text(Text::new_unvalidated(
        "Valid character data".to_string(),
    ))];

    let mut tokenizer = Tokenizer::new("Valid character data<".to_string());
    let accepted = tokenizer.munch_char_data();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2678
// A CDATA section yields a token holding the text between the delimiters.
#[test]
fn munch_cd_sect_hit() {
    let expected = vec![Token::CDATASection(CDATASection::new_unvalidated(
        "Valid cdata section".to_string(),
    ))];

    let mut tokenizer = Tokenizer::new("<![CDATA[Valid cdata section]]>".to_string());
    let accepted = tokenizer.munch_cd_sect();

    assert!(accepted);
    assert_eq!(tokenizer.tokens, expected);
    assert!(!tokenizer.error);
}
2693
// References: an entity reference, a decimal character reference and a
// hexadecimal character reference. The inputs are the escaped reference
// text; the expected tokens carry the entity name or the decoded character.
#[test]
fn munch_reference_hit() {
    let cases = vec![
        (
            "&amp;",
            Token::EntityRef(Name::new_unvalidated(String::from("amp"))),
        ),
        // 65 is the code point of 'A'.
        ("&#65;", Token::DecCharRef(DecCharRef::new_unvalidated('A'))),
        // U+1F61E is '😞'.
        (
            "&#x1F61E;",
            Token::HexCharRef(HexCharRef::new_unvalidated('😞')),
        ),
    ];

    for (input, expected_token) in cases {
        let mut tok = Tokenizer::new(String::from(input));
        assert!(tok.munch_reference(), "reference rejected: {:?}", input);
        assert_eq!(tok.tokens, vec![expected_token]);
        assert!(!tok.error, "error flag set for {:?}", input);
    }
}
2720
// Character references in decimal and hexadecimal form. The inputs are the
// escaped reference text; the expected tokens carry the decoded character.
#[test]
fn munch_char_ref_hit() {
    let cases = vec![
        // 65 is the code point of 'A'.
        ("&#65;", Token::DecCharRef(DecCharRef::new_unvalidated('A'))),
        // U+1F61E is '😞'.
        (
            "&#x1F61E;",
            Token::HexCharRef(HexCharRef::new_unvalidated('😞')),
        ),
    ];

    for (input, expected_token) in cases {
        let mut tok = Tokenizer::new(String::from(input));
        assert!(tok.munch_char_ref(), "reference rejected: {:?}", input);
        assert_eq!(tok.tokens, vec![expected_token]);
        assert!(!tok.error, "error flag set for {:?}", input);
    }
}
2739
// A hexadecimal character reference yields the decoded character. The input
// is the escaped reference text (U+1F61E is '😞'), not the character itself.
#[test]
fn munch_hexidecimal_char_ref_hit() {
    let mut tok = Tokenizer::new(String::from("&#x1F61E;"));
    assert!(tok.munch_hexidecimal_char_ref());
    assert_eq!(
        tok.tokens,
        vec![Token::HexCharRef(HexCharRef::new_unvalidated('😞'))]
    );
    assert!(!tok.error);
}
2750
2751 #[test]
2752 fn munch_decimal_char_ref_hit() {
2753 let mut tok = Tokenizer::new(String::from("A"));
2754 assert!(tok.munch_decimal_char_ref());
2755 assert_eq!(
2756 tok.tokens,
2757 vec![Token::DecCharRef(DecCharRef::new_unvalidated('A'))]
2758 );
2759 assert!(!tok.error);
2760 }
2761
2762 #[test]
2763 fn munch_entity_ref_hit() {
2764 let mut tok = Tokenizer::new(String::from("&"));
2765 assert!(tok.munch_reference());
2766 assert_eq!(
2767 tok.tokens,
2768 vec![Token::EntityRef(Name::new_unvalidated(String::from("amp")))]
2769 );
2770 assert!(!tok.error);
2771 }
2772
2773 #[test]
2774 fn munch_etag_hit() {
2775 let mut tok = Tokenizer::new(String::from("</etag>"));
2776 assert!(tok.munch_etag());
2777 assert_eq!(
2778 tok.tokens,
2779 vec![Token::ElementEnd(QName::new_unvalidated(
2780 None,
2781 String::from("etag")
2782 ))]
2783 );
2784 assert!(!tok.error);
2785
2786 let mut tok = Tokenizer::new(String::from("</prefix:etag>"));
2787 assert!(tok.munch_etag());
2788 assert_eq!(
2789 tok.tokens,
2790 vec![Token::ElementEnd(QName::new_unvalidated(
2791 Some(String::from("prefix")),
2792 String::from("etag")
2793 ))]
2794 );
2795 assert!(!tok.error);
2796 }
2797
2798 #[test]
2799 fn munch_attribute_hit() {
2800 let mut tok = Tokenizer::new(String::from("name='value'"));
2801 assert!(tok.munch_attribute());
2802 assert_eq!(
2803 tok.tokens,
2804 vec![
2805 Token::AttributeStart,
2806 Token::AttributeName(QName::new_unvalidated(None, String::from("name"))),
2807 Token::AttributeValueStart,
2808 Token::AttributeValue(AttributeValue::new_unvalidated(String::from("value"))),
2809 Token::AttributeEnd
2810 ]
2811 );
2812 assert!(!tok.error);
2813 }
2814
2815 #[test]
2816 fn munch_namespace_hit() {
2817 let mut tok = Tokenizer::new(String::from("xmlns='http://defaultnamespace.com'"));
2818 assert!(tok.munch_namespace());
2819 assert_eq!(
2820 tok.tokens,
2821 vec![
2822 Token::NamespaceStart,
2823 Token::NamespaceDefault,
2824 Token::NamespaceValue(NamespaceValue::new_unvalidated(String::from(
2825 "http://defaultnamespace.com"
2826 ))),
2827 Token::NamespaceEnd,
2828 ]
2829 );
2830 assert!(!tok.error);
2831
2832 let mut tok = Tokenizer::new(String::from("xmlns:prefix='http://prefixednamespace.com'"));
2833 assert!(tok.munch_namespace());
2834 assert_eq!(
2835 tok.tokens,
2836 vec![
2837 Token::NamespaceStart,
2838 Token::NamespacePrefix(NCName::new_unvalidated(String::from("prefix"))),
2839 Token::NamespaceValue(NamespaceValue::new_unvalidated(String::from(
2840 "http://prefixednamespace.com"
2841 ))),
2842 Token::NamespaceEnd,
2843 ]
2844 );
2845 assert!(!tok.error);
2846 }
2847
2848 #[test]
2849 fn munch_namespace_prefix_hit() {
2850 let mut tok = Tokenizer::new(String::from("validprefix"));
2851 assert!(tok.munch_namespace_prefix());
2852 assert_eq!(
2853 tok.tokens,
2854 vec![Token::NamespacePrefix(NCName::new_unvalidated(
2855 String::from("validprefix")
2856 )),]
2857 );
2858 assert!(!tok.error);
2859 }
2860
2861 #[test]
2862 fn munch_namespace_value_hit() {
2863 let mut tok = Tokenizer::new(String::from("'namespacevalue'"));
2864 assert!(tok.munch_namespace_value());
2865 assert_eq!(
2866 tok.tokens,
2867 vec![
2868 Token::NamespaceValue(NamespaceValue::new_unvalidated(String::from(
2869 "namespacevalue"
2870 ))),
2871 Token::NamespaceEnd,
2872 ]
2873 );
2874 assert!(!tok.error);
2875
2876 }
2878
2879 #[test]
2880 fn munch_attribute_name_hit() {
2881 let mut tok = Tokenizer::new(String::from("validname"));
2882 assert!(tok.munch_attribute_name());
2883 assert_eq!(
2884 tok.tokens,
2885 vec![
2886 Token::AttributeStart,
2887 Token::AttributeName(QName::new_unvalidated(None, String::from("validname"))),
2888 ]
2889 );
2890 assert!(!tok.error);
2891
2892 let mut tok = Tokenizer::new(String::from("prefix:validname"));
2893 assert!(tok.munch_attribute_name());
2894 assert_eq!(
2895 tok.tokens,
2896 vec![
2897 Token::AttributeStart,
2898 Token::AttributeName(QName::new_unvalidated(
2899 Some(String::from("prefix")),
2900 String::from("validname")
2901 )),
2902 ]
2903 );
2904 assert!(!tok.error);
2905 }
2906
2907 #[test]
2908 fn munch_attribute_value_hit() {
2909 let mut tok = Tokenizer::new(String::from("'value'"));
2910 assert!(tok.munch_attribute_value());
2911 assert_eq!(
2912 tok.tokens,
2913 vec![
2914 Token::AttributeValueStart,
2915 Token::AttributeValue(AttributeValue::new_unvalidated(String::from("value"))),
2916 Token::AttributeEnd
2917 ]
2918 );
2919 assert!(!tok.error);
2920
2921 let mut tok = Tokenizer::new(String::from("'this & that'"));
2922 assert!(tok.munch_attribute_value());
2923 assert_eq!(
2924 tok.tokens,
2925 vec![
2926 Token::AttributeValueStart,
2927 Token::AttributeValue(AttributeValue::new_unvalidated(String::from("this "))),
2928 Token::EntityRef(Name::new_unvalidated(String::from("amp"))),
2929 Token::AttributeValue(AttributeValue::new_unvalidated(String::from(" that"))),
2930 Token::AttributeEnd
2931 ]
2932 );
2933 assert!(!tok.error);
2934
2935 let mut tok = Tokenizer::new(String::from("\"This & that — and A.\""));
2936 assert!(tok.munch_attribute_value());
2937 assert_eq!(
2938 tok.tokens,
2939 vec![
2940 Token::AttributeValueStart,
2941 Token::AttributeValue(AttributeValue::new_unvalidated(String::from("This "))),
2942 Token::EntityRef(Name::new_unvalidated(String::from("amp"))),
2943 Token::AttributeValue(AttributeValue::new_unvalidated(String::from(" that "))),
2944 Token::HexCharRef(HexCharRef::new_unvalidated('—')),
2945 Token::AttributeValue(AttributeValue::new_unvalidated(String::from(" and "))),
2946 Token::DecCharRef(DecCharRef::new_unvalidated('A')),
2947 Token::AttributeValue(AttributeValue::new_unvalidated(String::from("."))),
2948 Token::AttributeEnd
2949 ]
2950 );
2951 assert!(!tok.error);
2952 }
2953}
2954
/// Returns `true` when `c` is one of the ASCII decimal digits `0` through `9`.
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}
2961
/// Returns `true` when `c` is an ASCII hexadecimal digit: `0`-`9`, `a`-`f`
/// or `A`-`F`.
fn is_hexidecimal_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
2970
2971pub struct ParseTokenError {
2972 kind: ParseTokenErrorKind,
2973}
2974impl ParseTokenError {
2975 pub fn new(kind: ParseTokenErrorKind) -> ParseTokenError {
2976 ParseTokenError { kind }
2977 }
2978
2979 fn message(&self) -> &str {
2981 match self.kind {
2982 ParseTokenErrorKind::PITarget => "Error parsing processing instruction target.",
2983 ParseTokenErrorKind::DecCharRef => "Error parsing Decimal Character Reference value.",
2984 ParseTokenErrorKind::HexCharRef => {
2985 "Error parsing Hexidecimal Character Reference value."
2986 }
2987 ParseTokenErrorKind::FromU32 => "Error converting u32 to char.",
2988 }
2989 }
2990}
2991
/// Identifies which token conversion failed; carried by `ParseTokenError`
/// and used to select its message.
pub enum ParseTokenErrorKind {
    /// Failure while parsing a processing instruction target.
    PITarget,
    /// Failure while parsing a decimal character reference value.
    DecCharRef,
    /// Failure while parsing a hexadecimal character reference value.
    HexCharRef,
    /// Failure while converting a `u32` to a `char`.
    FromU32,
}