1#[cfg(target_arch = "x86_64")]
72use std::arch::x86_64::*;
73use std::collections::VecDeque;
74use std::fmt::Display;
75use std::intrinsics::transmute;
76use std::io::Read;
77use std::ops::Range;
78use std::str::Utf8Error;
79
80use multiversion::multiversion;
81use num_enum::TryFromPrimitive;
82use slice_deque::SliceDeque;
83
/// Amount of free buffer space reserved before each refill of the parser's
/// buffer; a refill stops reading once more than half of this was received.
const READ_SIZE: usize = 4 * 4096;

/// Number of input bytes handled per classification step. The SIMD
/// classifiers require both ends of their input to be aligned to this size.
const BLOCK_SIZE: usize = 64;

// Character classes produced by the classifiers.
//
// The low four bits select the column of `PARSE_DFA`; the two highest bits
// are OR-ed into the pending event's flags and line up with `BIT_HAS_UTF8`
// (0x80) and `BIT_HAS_ESCAPES` (0x40).

/// Any byte without special meaning for the state machine.
const CH_OTHER: u8 = 0x00;
/// A byte >= 0x80: same DFA column as `CH_OTHER`, plus the "has UTF-8" flag.
const CH_OTHER_UTF8: u8 = 0x80;
/// `&`: same DFA column as `CH_OTHER`, plus the "has escapes" flag.
const CH_OTHER_AMPERSAND: u8 = 0x40;
/// `"`
const CH_DOUBLE_QUOTE: u8 = 0x01;
/// `'`
const CH_SINGLE_QUOTE: u8 = 0x02;
/// Tab, line feed, carriage return or space.
const CH_WHITESPACE: u8 = 0x03;
/// `!` or `?`
const CH_EXCL_QUEST_MARK: u8 = 0x04;
/// `/`
const CH_SLASH: u8 = 0x05;
/// `<`
const CH_LESS_THAN: u8 = 0x06;
/// `=`
const CH_EQUAL: u8 = 0x07;
/// `>`
const CH_GREATER_THAN: u8 = 0x08;
102
103
104fn char_to_code(c: u8) -> u8 {
106 match c {
107 b'\t' | b'\n' | b'\r' | b' ' => CH_WHITESPACE,
108 b'!' | b'?' => CH_EXCL_QUEST_MARK,
109 b'\"' => CH_DOUBLE_QUOTE,
110 b'\'' => CH_SINGLE_QUOTE,
111 b'&' => CH_OTHER_AMPERSAND,
112 b'/' => CH_SLASH,
113 b'<' => CH_LESS_THAN,
114 b'=' => CH_EQUAL,
115 b'>' => CH_GREATER_THAN,
116 128..=255 => CH_OTHER_UTF8,
117 _ => CH_OTHER,
118 }
119}
120
121fn classify_fallback(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
125 let mut prev = CH_LESS_THAN; for (i, c) in input.iter().enumerate() {
128 let ch = char_to_code(*c);
129
130 if ch != CH_OTHER || prev != CH_OTHER {
131 chars.push(ch);
132 positions.push(i);
133 prev = ch;
134 }
135 }
136}
137
/// SSSE3 classifier: produces the same `chars`/`positions` output as
/// `classify_fallback`, handling `BLOCK_SIZE` (64) bytes per loop iteration
/// as four 16-byte vectors.
///
/// # Safety
///
/// * The CPU must support SSSE3.
/// * `input` must start and end on a `BLOCK_SIZE`-aligned address (checked
///   only by `debug_assert!`); the loop always reads whole blocks using
///   aligned loads.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "ssse3")]
unsafe fn classify_ssse3(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
    // Bit groups used by the two nibble lookup tables below. A byte is a
    // "control" byte when the entry for its low nibble and the entry for its
    // high nibble share at least one bit:
    //   GROUP_A: 0x09, 0x0A, 0x0D          (tab, LF, CR)
    //   GROUP_B: 0x20, 0x21, 0x22          (space, '!', '"')
    //   GROUP_C: 0x26, 0x27                ('&', '\'')
    //   GROUP_D: 0x2F                      ('/'; its value overlaps C and F)
    //   GROUP_E: 0x3C, 0x3D, 0x3E          ('<', '=', '>')
    //   GROUP_F: 0x3F                      ('?')
    //   GROUP_G: 0x80..=0xFF               (bytes with the top bit set)
    const NOTHING: i8 = 0;
    const GROUP_A: i8 = 0x10;
    const GROUP_B: i8 = 0x20;
    const GROUP_C: i8 = 0x02;
    const GROUP_D: i8 = 0x06;
    const GROUP_E: i8 = 0x40;
    const GROUP_F: i8 = 0x04;
    const GROUP_G: i8 = 0x08;

    // Indexed by the low nibble of the input byte.
    let lo_nibbles_lookup = _mm_setr_epi8(
        GROUP_G | GROUP_B,           // 0x0
        GROUP_G | GROUP_B,           // 0x1
        GROUP_G | GROUP_B,           // 0x2
        GROUP_G,                     // 0x3
        GROUP_G,                     // 0x4
        GROUP_G,                     // 0x5
        GROUP_G | GROUP_C,           // 0x6
        GROUP_G | GROUP_C,           // 0x7
        GROUP_G,                     // 0x8
        GROUP_G | GROUP_A,           // 0x9
        GROUP_G | GROUP_A,           // 0xA
        GROUP_G,                     // 0xB
        GROUP_G | GROUP_E,           // 0xC
        GROUP_G | GROUP_A | GROUP_E, // 0xD
        GROUP_G | GROUP_E,           // 0xE
        GROUP_G | GROUP_D | GROUP_F, // 0xF
    );

    // Indexed by the high nibble of the input byte.
    let hi_nibbles_lookup = _mm_setr_epi8(
        GROUP_A,                     // 0x0_
        NOTHING,                     // 0x1_
        GROUP_B | GROUP_C | GROUP_D, // 0x2_
        GROUP_E | GROUP_F,           // 0x3_
        NOTHING,                     // 0x4_
        NOTHING,                     // 0x5_
        NOTHING,                     // 0x6_
        NOTHING,                     // 0x7_
        GROUP_G,                     // 0x8_
        GROUP_G,                     // 0x9_
        GROUP_G,                     // 0xA_
        GROUP_G,                     // 0xB_
        GROUP_G,                     // 0xC_
        GROUP_G,                     // 0xD_
        GROUP_G,                     // 0xE_
        GROUP_G,                     // 0xF_
    );

    let vec_x00 = _mm_set1_epi8(0x00);
    let vec_x0f = _mm_set1_epi8(0x0f);
    let vec_x20 = _mm_set1_epi8(0x20);
    let vec_x80 = _mm_set1_epi8(-128);

    // Final translation from the "compacted" byte value (see `load_vec`) to a
    // `CH_*` code. The shuffle indexes this table with the low four bits of
    // the compacted value; entries written as -1 are the slots that no
    // control byte maps to.
    let compact_lookup = _mm_setr_epi8(
        CH_WHITESPACE as i8,      // 0
        CH_EXCL_QUEST_MARK as i8, // 1
        CH_DOUBLE_QUOTE as i8,    // 2
        -1,                       // 3
        CH_OTHER_AMPERSAND as i8, // 4
        CH_SINGLE_QUOTE as i8,    // 5
        -1,                       // 6
        -1,                       // 7
        CH_OTHER_UTF8 as i8,      // 8
        CH_SLASH as i8,           // 9
        -1,                       // 10
        CH_EXCL_QUEST_MARK as i8, // 11
        CH_LESS_THAN as i8,       // 12
        CH_EQUAL as i8,           // 13
        CH_GREATER_THAN as i8,    // 14
        CH_OTHER as i8,           // 15
    );

    let mut ptr = input.as_ptr();
    let end = input.as_ptr().add(input.len());
    // The aligned loads below and the whole-block stepping rely on this.
    debug_assert!(ptr as usize % BLOCK_SIZE == 0);
    debug_assert!(end as usize % BLOCK_SIZE == 0);

    // Index of the current block's first byte within `input`.
    let mut offset = 0;

    // Per-block staging area for the translated `CH_*` codes; aligned so the
    // vector stores into it can use the aligned store instruction.
    #[repr(align(64))]
    struct ScratchPad([u8; BLOCK_SIZE]);
    let mut scratchpad = ScratchPad([0; 64]);

    // Control-byte bitmask of the previous block. The top bit starts set so
    // that the very first input byte is always reported (the fallback
    // classifier does the same by starting with a control "previous" class).
    let mut prev_mask = 1u64 << 63;

    while ptr < end {
        // Classifies 16 bytes at `ptr`, writes their `CH_*` codes to `out`
        // and returns a 16-bit mask with a 1 for every NON-control byte.
        let load_vec = |ptr: *const u8, out: *mut u8| -> u64 {
            let input = _mm_load_si128(ptr as *const __m128i);

            // Look up both nibbles; a non-zero AND means "control byte".
            let lo_nibbles = _mm_and_si128(input, vec_x0f);
            let hi_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), vec_x0f);
            let lo_translated = _mm_shuffle_epi8(lo_nibbles_lookup, lo_nibbles);
            let hi_translated = _mm_shuffle_epi8(hi_nibbles_lookup, hi_nibbles);
            let groups = _mm_and_si128(lo_translated, hi_translated);

            // 0xFF in every lane holding a plain (non-control) byte.
            let mask = _mm_cmpeq_epi8(groups, vec_x00);

            // Collapse all bytes >= 0x80 to exactly 0x80.
            let input = _mm_min_epu8(input, vec_x80);

            // Force plain bytes to 0xFF.
            let input = _mm_or_si128(input, mask);

            // Saturating subtract: whitespace below 0x20 becomes 0, plain
            // bytes become 0xDF (top bit still set).
            let input = _mm_subs_epu8(input, vec_x20);

            // Mixing in the group bits gives every control byte a distinct
            // low nibble, which is the index into `compact_lookup`.
            let input = _mm_xor_si128(input, groups);

            // The byte shuffle yields 0 (== CH_OTHER) for lanes whose index
            // has the top bit set, i.e. for the plain bytes.
            let input = _mm_shuffle_epi8(compact_lookup, input);

            _mm_store_si128(out as *mut __m128i, input);

            (_mm_movemask_epi8(mask) as u16) as u64
        };

        let mask_a = load_vec(ptr, &mut scratchpad.0[0] as *mut u8);
        let mask_b = load_vec(ptr.add(16), &mut scratchpad.0[16] as *mut u8);
        let mask_c = load_vec(ptr.add(32), &mut scratchpad.0[32] as *mut u8);
        let mask_d = load_vec(ptr.add(48), &mut scratchpad.0[48] as *mut u8);

        // Bit i set <=> byte i of this block is a control byte.
        let mut mask = !(mask_a | (mask_b << 16) | (mask_c << 32) | (mask_d << 48));

        // Also report the byte following each control byte, including the
        // first byte of this block when the previous block ended with one.
        let tmp = mask;
        mask = mask | (mask << 1) | (prev_mask >> 63);
        prev_mask = tmp;

        if mask != 0 {
            let count = mask.count_ones() as usize;

            // At most BLOCK_SIZE entries are written below, so after these
            // reservations the raw writes stay within capacity.
            chars.reserve(BLOCK_SIZE);
            positions.reserve(BLOCK_SIZE);
            let mut chars_ptr = chars.as_mut_ptr().add(chars.len());
            let mut positions_ptr = positions.as_mut_ptr().add(positions.len());

            // Walk the set bits from lowest to highest. The fixed-count inner
            // loop does not change the result; presumably it is there to
            // encourage unrolling.
            'outer: loop {
                for _ in 0..4 {
                    let index = mask.trailing_zeros() as usize;

                    // `mask` is non-zero here, so `index` < 64.
                    let ch = *scratchpad.0.get_unchecked(index);
                    let pos = index + offset;

                    *chars_ptr = ch;
                    chars_ptr = chars_ptr.add(1);
                    *positions_ptr = pos;
                    positions_ptr = positions_ptr.add(1);

                    // Clear the lowest set bit.
                    mask = mask & mask.wrapping_sub(1);
                    if mask == 0 {
                        break 'outer;
                    }
                }
            }

            // Exactly `count` elements were initialised above.
            chars.set_len(chars.len() + count);
            positions.set_len(positions.len() + count);
        }

        ptr = ptr.add(BLOCK_SIZE);
        offset += BLOCK_SIZE;
    }
}
422
/// AVX2 classifier: produces the same `chars`/`positions` output as
/// `classify_fallback`, handling `BLOCK_SIZE` (64) bytes per loop iteration
/// as two 32-byte vectors.
///
/// # Safety
///
/// * The CPU must support AVX2.
/// * `input` must start and end on a `BLOCK_SIZE`-aligned address (checked
///   only by `debug_assert!`); the loop always reads whole blocks using
///   aligned loads.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn classify_avx2(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
    // Bit groups used by the two nibble lookup tables below. A byte is a
    // "control" byte when the entry for its low nibble and the entry for its
    // high nibble share at least one bit:
    //   GROUP_A: 0x09, 0x0A, 0x0D          (tab, LF, CR)
    //   GROUP_B: 0x20, 0x21, 0x22          (space, '!', '"')
    //   GROUP_C: 0x26, 0x27                ('&', '\'')
    //   GROUP_D: 0x2F                      ('/'; its value overlaps C and F)
    //   GROUP_E: 0x3C, 0x3D, 0x3E          ('<', '=', '>')
    //   GROUP_F: 0x3F                      ('?')
    //   GROUP_G: 0x80..=0xFF               (bytes with the top bit set)
    const NOTHING: i8 = 0;
    const GROUP_A: i8 = 0x10;
    const GROUP_B: i8 = 0x20;
    const GROUP_C: i8 = 0x02;
    const GROUP_D: i8 = 0x06;
    const GROUP_E: i8 = 0x40;
    const GROUP_F: i8 = 0x04;
    const GROUP_G: i8 = 0x08;

    // Indexed by the low nibble of the input byte. The 16 entries are
    // written twice because the 256-bit byte shuffle looks up within each
    // 128-bit half separately.
    let lo_nibbles_lookup = _mm256_setr_epi8(
        // lower 128-bit half
        GROUP_G | GROUP_B,           // 0x0
        GROUP_G | GROUP_B,           // 0x1
        GROUP_G | GROUP_B,           // 0x2
        GROUP_G,                     // 0x3
        GROUP_G,                     // 0x4
        GROUP_G,                     // 0x5
        GROUP_G | GROUP_C,           // 0x6
        GROUP_G | GROUP_C,           // 0x7
        GROUP_G,                     // 0x8
        GROUP_G | GROUP_A,           // 0x9
        GROUP_G | GROUP_A,           // 0xA
        GROUP_G,                     // 0xB
        GROUP_G | GROUP_E,           // 0xC
        GROUP_G | GROUP_A | GROUP_E, // 0xD
        GROUP_G | GROUP_E,           // 0xE
        GROUP_G | GROUP_D | GROUP_F, // 0xF

        // upper 128-bit half (same table)
        GROUP_G | GROUP_B,
        GROUP_G | GROUP_B,
        GROUP_G | GROUP_B,
        GROUP_G,
        GROUP_G,
        GROUP_G,
        GROUP_G | GROUP_C,
        GROUP_G | GROUP_C,
        GROUP_G,
        GROUP_G | GROUP_A,
        GROUP_G | GROUP_A,
        GROUP_G,
        GROUP_G | GROUP_E,
        GROUP_G | GROUP_A | GROUP_E,
        GROUP_G | GROUP_E,
        GROUP_G | GROUP_D | GROUP_F,
    );

    // Indexed by the high nibble of the input byte; duplicated per half.
    let hi_nibbles_lookup = _mm256_setr_epi8(
        // lower 128-bit half
        GROUP_A,                     // 0x0_
        NOTHING,                     // 0x1_
        GROUP_B | GROUP_C | GROUP_D, // 0x2_
        GROUP_E | GROUP_F,           // 0x3_
        NOTHING,                     // 0x4_
        NOTHING,                     // 0x5_
        NOTHING,                     // 0x6_
        NOTHING,                     // 0x7_
        GROUP_G,                     // 0x8_
        GROUP_G,                     // 0x9_
        GROUP_G,                     // 0xA_
        GROUP_G,                     // 0xB_
        GROUP_G,                     // 0xC_
        GROUP_G,                     // 0xD_
        GROUP_G,                     // 0xE_
        GROUP_G,                     // 0xF_

        // upper 128-bit half (same table)
        GROUP_A,
        NOTHING,
        GROUP_B | GROUP_C | GROUP_D,
        GROUP_E | GROUP_F,
        NOTHING,
        NOTHING,
        NOTHING,
        NOTHING,
        GROUP_G,
        GROUP_G,
        GROUP_G,
        GROUP_G,
        GROUP_G,
        GROUP_G,
        GROUP_G,
        GROUP_G,
    );

    let vec_x00 = _mm256_set1_epi8(0x00);
    let vec_x0f = _mm256_set1_epi8(0x0f);
    let vec_x20 = _mm256_set1_epi8(0x20);
    let vec_x80 = _mm256_set1_epi8(-128);

    // Final translation from the "compacted" byte value to a `CH_*` code,
    // indexed by the low four bits of that value; duplicated per half.
    // Entries written as -1 are the slots that no control byte maps to.
    let compact_lookup = _mm256_setr_epi8(
        // lower 128-bit half
        CH_WHITESPACE as i8,      // 0
        CH_EXCL_QUEST_MARK as i8, // 1
        CH_DOUBLE_QUOTE as i8,    // 2
        -1,                       // 3
        CH_OTHER_AMPERSAND as i8, // 4
        CH_SINGLE_QUOTE as i8,    // 5
        -1,                       // 6
        -1,                       // 7
        CH_OTHER_UTF8 as i8,      // 8
        CH_SLASH as i8,           // 9
        -1,                       // 10
        CH_EXCL_QUEST_MARK as i8, // 11
        CH_LESS_THAN as i8,       // 12
        CH_EQUAL as i8,           // 13
        CH_GREATER_THAN as i8,    // 14
        CH_OTHER as i8,           // 15

        // upper 128-bit half (same table)
        CH_WHITESPACE as i8,
        CH_EXCL_QUEST_MARK as i8,
        CH_DOUBLE_QUOTE as i8,
        -1,
        CH_OTHER_AMPERSAND as i8,
        CH_SINGLE_QUOTE as i8,
        -1,
        -1,
        CH_OTHER_UTF8 as i8,
        CH_SLASH as i8,
        -1,
        CH_EXCL_QUEST_MARK as i8,
        CH_LESS_THAN as i8,
        CH_EQUAL as i8,
        CH_GREATER_THAN as i8,
        CH_OTHER as i8,
    );

    let mut ptr = input.as_ptr();
    let end = input.as_ptr().add(input.len());
    // The aligned loads below and the whole-block stepping rely on this.
    debug_assert!(ptr as usize % BLOCK_SIZE == 0);
    debug_assert!(end as usize % BLOCK_SIZE == 0);

    // Index of the current block's first byte within `input`.
    let mut offset = 0;

    // Per-block staging area for the translated `CH_*` codes; aligned so the
    // vector stores into it can use the aligned store instruction.
    #[repr(align(64))]
    struct ScratchPad([u8; BLOCK_SIZE]);
    let mut scratchpad = ScratchPad([0; 64]);

    // Control-byte bitmask of the previous block. The top bit starts set so
    // that the very first input byte is always reported.
    let mut prev_mask = 1u64 << 63;

    while ptr < end {
        // First 32 bytes of the block. Yields a 32-bit mask with a 1 for
        // every NON-control byte and stores the `CH_*` codes in the
        // scratchpad.
        let mask_a = {
            let input = _mm256_load_si256(ptr as *const __m256i);

            // Look up both nibbles; a non-zero AND means "control byte".
            let lo_nibbles = _mm256_and_si256(input, vec_x0f);
            let hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(input, 4), vec_x0f);
            let lo_translated = _mm256_shuffle_epi8(lo_nibbles_lookup, lo_nibbles);
            let hi_translated = _mm256_shuffle_epi8(hi_nibbles_lookup, hi_nibbles);
            let groups = _mm256_and_si256(lo_translated, hi_translated);

            // 0xFF in every lane holding a plain (non-control) byte.
            let mask = _mm256_cmpeq_epi8(groups, vec_x00);

            // Compaction: clamp bytes >= 0x80 to 0x80, force plain bytes to
            // 0xFF, saturating-subtract 0x20, then XOR in the group bits so
            // each control byte gets a distinct low nibble. Plain bytes keep
            // their top bit set, for which the shuffle yields 0 (CH_OTHER).
            let input = _mm256_min_epu8(input, vec_x80);
            let input = _mm256_or_si256(input, mask);
            let input = _mm256_subs_epu8(input, vec_x20);
            let input = _mm256_xor_si256(input, groups);
            let input = _mm256_shuffle_epi8(compact_lookup, input);

            _mm256_store_si256(&mut scratchpad.0[0] as *mut u8 as *mut __m256i, input);

            (_mm256_movemask_epi8(mask) as u32) as u64
        };

        // Second 32 bytes of the block; identical steps to `mask_a`.
        let mask_b = {
            let input = _mm256_load_si256(ptr.add(32) as *const __m256i);

            let lo_nibbles = _mm256_and_si256(input, vec_x0f);
            let hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(input, 4), vec_x0f);
            let lo_translated = _mm256_shuffle_epi8(lo_nibbles_lookup, lo_nibbles);
            let hi_translated = _mm256_shuffle_epi8(hi_nibbles_lookup, hi_nibbles);
            let groups = _mm256_and_si256(lo_translated, hi_translated);

            let mask = _mm256_cmpeq_epi8(groups, vec_x00);

            let input = _mm256_min_epu8(input, vec_x80);
            let input = _mm256_or_si256(input, mask);
            let input = _mm256_subs_epu8(input, vec_x20);
            let input = _mm256_xor_si256(input, groups);
            let input = _mm256_shuffle_epi8(compact_lookup, input);

            _mm256_store_si256(&mut scratchpad.0[32] as *mut u8 as *mut __m256i, input);

            (_mm256_movemask_epi8(mask) as u32) as u64
        };

        // Bit i set <=> byte i of this block is a control byte.
        let mut mask = !(mask_a | (mask_b << 32));

        // Also report the byte following each control byte, including the
        // first byte of this block when the previous block ended with one.
        let tmp = mask;
        mask = mask | (mask << 1) | (prev_mask >> 63);
        prev_mask = tmp;

        if mask != 0 {
            let count = mask.count_ones() as usize;

            // At most BLOCK_SIZE entries are written below, so after these
            // reservations the raw writes stay within capacity.
            chars.reserve(BLOCK_SIZE);
            positions.reserve(BLOCK_SIZE);
            let mut chars_ptr = chars.as_mut_ptr().add(chars.len());
            let mut positions_ptr = positions.as_mut_ptr().add(positions.len());

            // Walk the set bits from lowest to highest. The fixed-count inner
            // loop does not change the result; presumably it is there to
            // encourage unrolling.
            'outer: loop {
                for _ in 0..4 {
                    let index = mask.trailing_zeros() as usize;

                    // `mask` is non-zero here, so `index` < 64.
                    let ch = *scratchpad.0.get_unchecked(index);
                    let pos = index + offset;

                    *chars_ptr = ch;
                    chars_ptr = chars_ptr.add(1);
                    *positions_ptr = pos;
                    positions_ptr = positions_ptr.add(1);

                    // Clear the lowest set bit.
                    mask = mask & mask.wrapping_sub(1);
                    if mask == 0 {
                        break 'outer;
                    }
                }
            }

            // Exactly `count` elements were initialised above.
            chars.set_len(chars.len() + count);
            positions.set_len(positions.len() + count);
        }

        ptr = ptr.add(BLOCK_SIZE);
        offset += BLOCK_SIZE;
    }
}
656
// Classification entry point used by the parser.
//
// The `multiversion` attributes name `classify_avx2` and `classify_ssse3` as
// specializations for x86/x86_64 targets with the respective CPU feature; the
// body below is the portable default. All variants append the class and index
// of every interesting byte of `input` to `chars` and `positions`.
//
// NOTE(review): the SIMD variants expect `input` to start and end on a
// `BLOCK_SIZE`-aligned address; callers must guarantee that.
#[multiversion]
#[specialize(target = "[x86|x86_64]+avx2", fn = "classify_avx2", unsafe = true)]
#[specialize(target = "[x86|x86_64]+ssse3", fn = "classify_ssse3", unsafe = true)]
fn classify(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
    classify_fallback(input, chars, positions)
}
665
// Action bits stored in the upper three bits of every `PARSE_DFA` entry.
// They are applied in the order: save end, emit, save start.

/// Push an event for the state being left, spanning the saved start and end
/// positions, and clear the accumulated flags.
const BIT_EMIT: u8 = 0b1000_0000;

/// Remember the current position as the start of the next event.
const BIT_SAVE_START: u8 = 0b0100_0000;

/// Remember the current position as the (exclusive) end of the current event.
const BIT_SAVE_END: u8 = 0b0010_0000;
676
/// Transition table of the tokenizer state machine.
///
/// The entry for state `s` and character class `c` is at index
/// `s * 9 + (c & 0b1111)`. Each entry holds the next state number in its low
/// five bits, OR-ed with any of `BIT_EMIT`, `BIT_SAVE_START` and
/// `BIT_SAVE_END`. An entry of `0x00` means "no valid transition" and sends
/// the state machine to its exception handler.
///
/// Columns, in order: `CH_OTHER` (also `&` and bytes >= 0x80),
/// `CH_DOUBLE_QUOTE`, `CH_SINGLE_QUOTE`, `CH_WHITESPACE`,
/// `CH_EXCL_QUEST_MARK`, `CH_SLASH`, `CH_LESS_THAN`, `CH_EQUAL`,
/// `CH_GREATER_THAN`.
///
/// The row descriptions below are read off the transitions themselves.
/// NOTE(review): confirm that the discriminants of the `State` enum agree
/// with these row numbers, in particular for rows 12 to 18.
const PARSE_DFA: [u8; 19 * 9] = [
    // 0: exception (never looked up with a valid transition)
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 1: inside a start tag's name
    0x01, 0x00, 0x00, 0xaa, 0x00, 0xa3, 0x00, 0x00, 0xa7,
    // 2: inside an end tag's name
    0x02, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, 0x00, 0xa7,
    // 3: after the '/' of a self-closing tag; only '>' is accepted
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87,
    // 4: inside text
    0x04, 0x04, 0x04, 0x2c, 0x04, 0x04, 0xa8, 0x04, 0x04,
    // 5: inside an attribute name
    0x05, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0xaf, 0x00,
    // 6: inside a double-quoted attribute value
    0x06, 0xb2, 0x06, 0x06, 0x06, 0x06, 0x00, 0x06, 0x06,
    // 7: outside of any tag or text
    0x44, 0x44, 0x44, 0x07, 0x44, 0x44, 0x08, 0x44, 0x44,
    // 8: directly after '<'
    0x41, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,
    // 9: directly after "</"
    0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 10: inside a start tag, between attributes
    0x45, 0x00, 0x00, 0x0a, 0x00, 0x03, 0x00, 0x00, 0x07,
    // 11: inside an end tag, after its name
    0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x07,
    // 12: inside text, in a run of whitespace that may be trailing
    0x04, 0x04, 0x04, 0x0c, 0x04, 0x04, 0x88, 0x04, 0x04,
    // 13: after an attribute name, waiting for '='
    0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0f, 0x00,
    // 14: inside a single-quoted attribute value
    0x0e, 0x0e, 0xb2, 0x0e, 0x0e, 0x0e, 0x00, 0x0e, 0x0e,
    // 15: after '=', waiting for the opening quote
    0x00, 0x10, 0x11, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 16: directly after an opening double quote
    0x46, 0x00, 0x46, 0x46, 0x46, 0x46, 0x00, 0x46, 0x46,
    // 17: directly after an opening single quote
    0x4e, 0x4e, 0x00, 0x4e, 0x4e, 0x4e, 0x00, 0x4e, 0x4e,
    // 18: after the closing quote of an attribute value
    0x00, 0x00, 0x00, 0x0a, 0x00, 0x03, 0x00, 0x00, 0x07,
];
701
// Flag bits kept in the two highest bits of an event's `code`.

/// The event's bytes may contain non-ASCII data and must be validated as
/// UTF-8 before being handed out as `&str`.
const BIT_HAS_UTF8: u8 = 0b1000_0000;

/// The event's bytes contain `&` and must have their escapes decoded before
/// being handed out.
const BIT_HAS_ESCAPES: u8 = 0b0100_0000;
710
/// Kind of an [`Event`].
///
/// The discriminants fit in three bits; `Event::code` recovers the kind by
/// masking the low three bits of the internal event code.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
#[repr(u8)]
pub enum EventCode {
    /// Name of a start tag.
    StartTag = 0o1,

    /// Name of an end tag.
    EndTag = 0o2,

    /// End of a self-closing tag (`/>`).
    EndTagImmediate = 0o3,

    /// Text content (also used for CDATA sections).
    Text = 0o4,

    /// Name of an attribute.
    AttributeName = 0o5,

    /// Value of an attribute.
    AttributeValue = 0o6,

    /// End of input.
    Eof = 0o7,
}
741
742#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, TryFromPrimitive)]
745#[repr(u8)]
746enum State {
747 Exception = 0,
750
751 Outside = EventCode::Eof as u8, TagStart = 0o10,
755 TagEnd = 0o11,
756
757 TagName = EventCode::StartTag as u8, TagEndName = EventCode::EndTag as u8, InTag = 0o12,
761 InTagEnd = 0o13,
762
763 AttrName = EventCode::AttributeName as u8, AfterAttrName = 0o14, AfterAttrEq = 0o15,
768
769 AttrValueDoubleQuotedOpen = 0o21,
770 AttrValueDoubleQuoted = EventCode::AttributeValue as u8, AttrValueSingleQuotedOpen = 0o20,
773 AttrValueSingleQuoted = EventCode::AttributeValue as u8 | 0o10, AfterAttrValue = 0o17,
776
777 AfterImmediateEndTag = EventCode::EndTagImmediate as u8, InText = EventCode::Text as u8, InTextEndWhitespace = 0o22,
781
782 HandledException = 0o100,
787 InProcessingInstruction = 0o101,
788 InCommentOrCDATA = 0o102,
789 InComment = 0o103,
790 InCDATA = 0o104,
791}
792
793#[derive(Clone, Debug)]
794struct ParserState {
795 state: State,
796 flags: u8,
797 start_position: usize,
798 end_position: usize,
799 exception_up_to: usize,
800}
801
802impl Default for ParserState {
803 fn default() -> Self {
804 Self {
805 state: State::Outside,
806 flags: 0,
807 start_position: 0,
808 end_position: 0,
809 exception_up_to: 0,
810 }
811 }
812}
813
814#[derive(Debug)]
815struct StateMachine<'e, 'state> {
816 events: &'e mut VecDeque<InternalEvent>,
817 errors: &'e mut VecDeque<MalformedXmlError>,
818 state: &'state mut ParserState,
819 position_offset: usize,
820}
821
822impl<'e, 'state> StateMachine<'e, 'state> {
823 fn run(&mut self, chars: &[u8], positions: &[usize], buffer: &[u8]) {
824 if self.state.state as u8 > State::HandledException as u8 {
826 self.continue_exception(buffer);
827
828 if self.state.state as u8 > State::HandledException as u8 {
829 return;
831 }
832 }
833
834 let mut s = (*self.state).clone();
835
836 for (ch, pos) in chars.iter().copied().zip(positions.iter().copied()) {
838 let out_pos = pos + self.position_offset;
839
840 if out_pos < s.exception_up_to {
841 continue;
844 }
845
846 s.flags |= ch;
850
851 let dfa_index = s.state as u8 * 9 + (ch & 0b1111);
852 let transition = unsafe { *PARSE_DFA.get_unchecked(dfa_index as usize) };
853
854 if transition == State::Exception as u8 {
855 *self.state = s;
856 self.handle_exception(out_pos, buffer);
857 s = (*self.state).clone();
858
859 if s.state as u8 > State::HandledException as u8 {
860 return;
863 } else {
864 continue;
865 }
866 }
867
868 if transition & BIT_SAVE_END != 0 {
869 s.end_position = out_pos;
870 }
871
872 if transition & BIT_EMIT != 0 {
873 debug_assert!(s.state as u8 & 0b1100_0000 == 0, "The highest two bits in State are set now? We must mask them away before ORing with flags.");
874 let code = s.state as u8 | (s.flags & 0b1100_0000);
875 self.events.push_back(InternalEvent {
876 start: s.start_position,
877 end: s.end_position,
878 code,
879 });
880 s.flags = 0;
881 }
882
883 if transition & BIT_SAVE_START != 0 {
884 s.start_position = out_pos;
885 }
886
887 s.state = match transition & 0b0001_1111 {
888 #[cfg(debug_assertions)]
889 state_num => State::try_from(state_num).unwrap(),
890
891 #[cfg(not(debug_assertions))]
892 state_num => unsafe { std::mem::transmute::<u8, State>(state_num) },
893 };
894 }
895
896 *self.state = s;
897 }
898
899 #[cold]
903 fn handle_exception(&mut self, pos: usize, buffer: &[u8]) {
904 match (self.state.state, buffer[pos]) {
905 (State::TagStart, b'?') =>
906 self.handle_processing_instruction(pos, buffer),
907
908 (State::TagStart, b'!') =>
909 self.handle_comment_or_cdata(pos, buffer),
914
915 _ =>
916 self.handle_error(pos, buffer),
917 }
918 }
919
920 #[cold]
921 fn continue_exception(&mut self, buffer: &[u8]) {
922 match self.state.state {
923 State::InProcessingInstruction =>
924 self.continue_processing_instruction(buffer),
925
926 State::InCommentOrCDATA =>
927 self.continue_comment_or_cdata(buffer),
928
929 State::InComment =>
930 self.continue_comment(buffer),
931
932 State::InCDATA =>
933 self.continue_cdata(buffer),
934
935 _ =>
936 unreachable!("continue_exception was called with state {:?}", self.state.state),
937 }
938 }
939
940 fn handle_processing_instruction(&mut self, pos: usize, buffer: &[u8]) {
941 self.state.start_position = pos + 1;
942 self.state.state = State::InProcessingInstruction;
943
944 self.continue_processing_instruction(buffer);
945 }
946
947 fn continue_processing_instruction(&mut self, buffer: &[u8]) {
948 debug_assert_eq!(self.state.state, State::InProcessingInstruction);
952
953 let pi_start = self.state.start_position;
954 let pi_end = twoway::find_bytes(&buffer[pi_start..], b"?>");
955 if let Some(pi_end) = pi_end {
956 let pi_end = pi_end + pi_start;
957 self.state.exception_up_to = pi_end + 2;
958 self.state.state = State::Outside;
959 self.state.flags = 0;
960 } else {
961 }
963 }
964
965 fn handle_comment_or_cdata(&mut self, pos: usize, buffer: &[u8]) {
966 self.state.start_position = pos + 1;
967 self.state.state = State::InCommentOrCDATA;
968
969 self.continue_comment_or_cdata(buffer);
970 }
971
972 fn continue_comment_or_cdata(&mut self, buffer: &[u8]) {
973 debug_assert_eq!(self.state.state, State::InCommentOrCDATA);
974 let pos = self.state.start_position;
975
976 let buffer_slice = &buffer[pos..];
977
978 if buffer_slice.starts_with(b"--") {
979 self.state.start_position = pos + 2;
980 self.state.state = State::InComment;
981 return self.continue_comment(buffer);
982 }
983
984 if buffer_slice.starts_with(b"[CDATA[") {
985 self.state.start_position = pos + 7;
986 self.state.state = State::InCDATA;
987 return self.continue_cdata(buffer);
988 }
989
990 if buffer_slice.len() < 2 && b"--".starts_with(buffer_slice) {
991 } else if buffer_slice.len() < 7 && b"[CDATA[".starts_with(buffer_slice) {
993 } else {
995 self.handle_error(pos, buffer);
997 }
998 }
999
1000 fn continue_comment(&mut self, buffer: &[u8]) {
1001 debug_assert_eq!(self.state.state, State::InComment);
1005
1006 let comment_start = self.state.start_position;
1007 let comment_end = twoway::find_bytes(&buffer[comment_start..], b"-->");
1008 if let Some(comment_end) = comment_end {
1009 let comment_end = comment_end + comment_start;
1010 self.state.exception_up_to = comment_end + 3;
1011 self.state.state = State::Outside;
1012 self.state.flags = 0;
1013 } else {
1014 }
1016 }
1017
1018 fn continue_cdata(&mut self, buffer: &[u8]) {
1019 debug_assert_eq!(self.state.state, State::InCDATA);
1022
1023 let cdata_start = self.state.start_position;
1024 let cdata_end = twoway::find_bytes(&buffer[cdata_start..], b"]]>");
1025 if let Some(cdata_end) = cdata_end {
1026 let cdata_end = cdata_end + cdata_start;
1027
1028 let code = EventCode::Text as u8 | BIT_HAS_UTF8; self.events.push_back(InternalEvent {
1030 start: cdata_start,
1031 end: cdata_end,
1032 code,
1033 });
1034
1035 self.state.exception_up_to = cdata_end + 3;
1036 self.state.state = State::Outside;
1037 self.state.flags = 0;
1038 } else {
1039 }
1041 }
1042
1043 fn handle_error(&mut self, pos: usize, buffer: &[u8]) {
1044 self.events.push_back(InternalEvent {
1046 start: pos,
1047 end: pos,
1048 code: 0,
1049 });
1050
1051 let context_slice = pos.saturating_sub(10)..(pos + 10).min(buffer.len());
1052 self.errors.push_back(MalformedXmlError {
1053 kind: MalformedXMLKind::Other,
1054 context: Some(String::from_utf8_lossy(&buffer[context_slice]).into_owned()),
1055 });
1056
1057 let continue_from = twoway::find_bytes(&buffer[(pos+1)..], b">").unwrap_or(0);
1062 self.state.exception_up_to = pos + 1 + continue_from;
1063 self.state.state = State::Outside;
1064 self.state.flags = 0;
1065 }
1067}
1068
/// Category of a [`MalformedXmlError`].
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum MalformedXMLKind {
    /// The input ended while a construct was still open.
    UnexpectedEof,

    /// Any other syntax problem.
    Other,
}
1078
1079#[derive(Clone, Debug)]
1081pub struct MalformedXmlError {
1082 kind: MalformedXMLKind,
1084
1085 context: Option<String>,
1087}
1088
1089impl std::fmt::Display for MalformedXmlError {
1090 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
1091 write!(f, "Malformed XML")?;
1092
1093 if self.kind != MalformedXMLKind::Other {
1094 let msg = match self.kind {
1095 MalformedXMLKind::UnexpectedEof => "unexpected EOF",
1096 MalformedXMLKind::Other => "",
1097 };
1098
1099 write!(f, " ({})", msg)?;
1100 }
1101
1102 if let Some(context) = &self.context {
1103 write!(f, ": {:?}", context)?;
1104 }
1105
1106 Ok(())
1107 }
1108}
1109
1110impl std::error::Error for MalformedXmlError {}
1111
1112#[derive(Debug)]
1114pub enum ParseError {
1115 IO(std::io::Error),
1117
1118 MalformedXML(MalformedXmlError),
1120}
1121
1122impl std::fmt::Display for ParseError {
1123 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
1124 match self {
1125 ParseError::IO(err) => Display::fmt(err, f),
1126 ParseError::MalformedXML(err) => Display::fmt(err, f),
1127 }
1128 }
1129}
1130
1131impl std::error::Error for ParseError {}
1132
1133impl From<std::io::Error> for ParseError {
1134 fn from(err: std::io::Error) -> Self {
1135 ParseError::IO(err)
1136 }
1137}
1138
1139impl From<MalformedXmlError> for ParseError {
1140 fn from(err: MalformedXmlError) -> Self {
1141 ParseError::MalformedXML(err)
1142 }
1143}
1144
/// Error returned when the content of an event cannot be decoded.
#[derive(Debug)]
pub enum DecodeError {
    /// The bytes are not valid UTF-8.
    BadUtf8,

    /// The bytes contain an invalid XML escape.
    BadEscape,
}

impl std::fmt::Display for DecodeError {
    /// Writes a short, fixed description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
        let description = match self {
            DecodeError::BadEscape => "bad xml escape",
            DecodeError::BadUtf8 => "bad utf8",
        };
        f.write_str(description)
    }
}

impl std::error::Error for DecodeError {}

impl From<std::str::Utf8Error> for DecodeError {
    /// Any UTF-8 validation failure maps to `BadUtf8`; the details are dropped.
    fn from(_err: std::str::Utf8Error) -> Self {
        Self::BadUtf8
    }
}
1171
/// Event as queued by the state machine, referring to the buffer by index
/// instead of borrowing it.
#[derive(Debug)]
struct InternalEvent {
    /// Index of the first byte of the event's content in the buffer.
    start: usize,
    /// Exclusive end index of the event's content in the buffer.
    end: usize,
    /// Event kind in the low bits plus `BIT_HAS_*` flags; 0 marks an error
    /// whose details sit in the error queue.
    code: u8,
}
1178
1179#[derive(Debug)]
1184pub struct Event<'a> {
1185 slice: &'a mut [u8],
1187
1188 code: u8,
1193}
1194
1195impl<'a> Event<'a> {
1196 #[inline(always)]
1197 fn new(internal_event: &InternalEvent, buffer: &'a mut [u8]) -> Self {
1198 Self {
1199 slice: &mut buffer[internal_event.start..internal_event.end],
1200 code: internal_event.code,
1201 }
1202 }
1203
1204 fn eof() -> Self {
1205 Self {
1206 slice: &mut [],
1207 code: EventCode::Eof as u8,
1208 }
1209 }
1210
1211 #[inline(always)]
1213 pub fn code(&self) -> EventCode {
1214 unsafe {
1217 transmute(self.code & 0b00000111)
1218 }
1219 }
1220
1221 #[cold]
1222 fn check_utf8(&self) -> Result<(), Utf8Error> {
1223 std::str::from_utf8(self.slice).map(|_| ())
1224 }
1225
1226 #[cold]
1227 fn decode_escapes(&mut self) -> Result<(), DecodeError> {
1228 fn find_next_escape(slice: &[u8], from: usize) -> Option<(usize, usize, char)> {
1229 let start = slice[from..].iter().position(|c| *c == b'&')? + from;
1230 let end = slice[start..].iter().position(|c| *c == b';')? + start;
1231
1232 match &slice[start+1..end] {
1233 b"lt" => Some((start, end, '<')),
1234 b"gt" => Some((start, end, '>')),
1235 b"amp" => Some((start, end, '&')),
1236 b"apos" => Some((start, end, '\'')),
1237 b"quot" => Some((start, end, '"')),
1238
1239 [b'#', b'x', hexa @ ..] => {
1240 let c = std::char::from_u32(btoi::btou_radix(hexa, 16).ok()?)?;
1241 Some((start, end, c))
1242 },
1243
1244 [b'#', decimal @ ..] => {
1245 let c = std::char::from_u32(btoi::btou(decimal).ok()?)?;
1246 Some((start, end, c))
1247 },
1248
1249 _ => None,
1250 }
1251 }
1252
1253 let mut escape = match find_next_escape(&self.slice, 0) {
1254 Some(escape) => escape,
1255 None => return Ok(()),
1256 };
1257 let mut current_end = escape.0;
1258
1259 loop {
1260 let next_escape = find_next_escape(&self.slice, escape.1);
1261 let next_escape_end = next_escape.map(|n| n.0).unwrap_or(self.slice.len());
1262
1263 let utf8_len = escape.2.encode_utf8(&mut self.slice[current_end..]).len();
1264 debug_assert!(utf8_len <= escape.1 - escape.0, "We got XML escape that is shorter than its UTF-8 representation. That should not be possible.");
1265 current_end += utf8_len;
1266
1267 self.slice.copy_within((escape.1 + 1)..next_escape_end, current_end);
1268 current_end += next_escape_end - escape.1 - 1;
1269
1270 match next_escape {
1271 Some(next_escape) => {
1272 escape = next_escape;
1273 }
1274 None => {
1275 let mut tmp = &mut [][..];
1277 std::mem::swap(&mut tmp, &mut self.slice);
1278 tmp = &mut tmp[..current_end];
1279 std::mem::swap(&mut tmp, &mut self.slice);
1280
1281 break;
1282 }
1283 }
1284 }
1285
1286 Ok(())
1287 }
1288
1289 #[inline(always)]
1291 pub fn get_bytes(&mut self) -> Result<&[u8], DecodeError> {
1292 if self.code & BIT_HAS_ESCAPES != 0 {
1293 self.decode_escapes()?;
1294 self.code &= !BIT_HAS_ESCAPES;
1295 }
1296
1297 Ok(self.slice)
1298 }
1299
1300 #[inline(always)]
1302 pub fn get_str(&mut self) -> Result<&str, DecodeError> {
1303 if self.code & BIT_HAS_ESCAPES != 0 {
1304 self.decode_escapes()?;
1305 self.code &= !BIT_HAS_ESCAPES;
1306 }
1307
1308 if self.code & BIT_HAS_UTF8 != 0 {
1309 self.check_utf8()?;
1310 self.code &= !BIT_HAS_UTF8;
1311 }
1312
1313 unsafe {
1316 Ok(std::str::from_utf8_unchecked(self.slice))
1317 }
1318 }
1319}
1320
1321pub struct Parser<R: Read> {
1323 reader: R,
1325
1326 buffer: SliceDeque<u8>,
1328
1329 buffer_parsed: Range<usize>,
1337
1338 reached_eof: bool,
1340
1341 control_characters: Vec<u8>,
1342 control_character_positions: Vec<usize>,
1343
1344 state: ParserState,
1346
1347 events: VecDeque<InternalEvent>,
1348 errors: VecDeque<MalformedXmlError>,
1349}
1350
1351impl<R: Read> Parser<R> {
1352 pub fn new(reader: R) -> Self {
1354 Self {
1355 reader,
1356 buffer: SliceDeque::with_capacity(READ_SIZE * 2),
1357 buffer_parsed: 0..0,
1358 reached_eof: false,
1359 control_characters: Default::default(),
1360 control_character_positions: Default::default(),
1361 state: ParserState::default(),
1362 events: VecDeque::new(),
1363 errors: VecDeque::new(),
1364 }
1365 }
1366
1367 fn parse_more(&mut self) -> Result<(), std::io::Error> {
1369 debug_assert!(self.events.is_empty(), "The `start` and `end` in events will not be valid after this function runs!");
1370
1371 let to_throw_away = (self.state.start_position / BLOCK_SIZE) * BLOCK_SIZE; unsafe { self.buffer.move_head_unchecked(to_throw_away as isize);
1375 }
1376 self.state.start_position -= to_throw_away;
1377 self.state.end_position = self.state.end_position.saturating_sub(to_throw_away);
1378 self.state.exception_up_to = self.state.exception_up_to.saturating_sub(to_throw_away);
1379 debug_assert!(self.control_characters.is_empty());
1380 debug_assert!(self.control_character_positions.is_empty());
1381
1382 self.buffer.reserve(READ_SIZE); unsafe {
1386 let mut total_bytes_got = 0;
1387 loop {
1388 let uninit_buffer = self.buffer.tail_head_slice();
1389 let bytes_got = self.reader.read(uninit_buffer)?;
1395
1396 if bytes_got == 0 {
1397 self.reached_eof = true;
1398
1399 let padding = (BLOCK_SIZE - (self.buffer.as_ptr() as usize + self.buffer.len()) % BLOCK_SIZE) % BLOCK_SIZE;
1403
1404 for _ in 0..padding {
1405 self.buffer.push_back(b'\0');
1406 }
1407
1408 break;
1409 }
1410
1411 self.buffer.move_tail_unchecked(bytes_got as isize);
1412
1413 total_bytes_got += bytes_got;
1417 if total_bytes_got > READ_SIZE / 2 {
1418 break;
1419 }
1420 }
1421 }
1422
1423 let slice = self.buffer.as_slice();
1427 self.buffer_parsed = (self.buffer_parsed.end - to_throw_away)..(slice.len() / BLOCK_SIZE * BLOCK_SIZE);
1428 let slice = &slice[self.buffer_parsed.clone()];
1429
1430 classify(slice, &mut self.control_characters, &mut self.control_character_positions);
1431
1432 let mut state_machine = StateMachine {
1433 events: &mut self.events,
1434 errors: &mut self.errors,
1435 state: &mut self.state,
1436 position_offset: self.buffer_parsed.start,
1437 };
1438 state_machine.run(&self.control_characters, &self.control_character_positions, &self.buffer);
1439 self.control_characters.clear();
1440 self.control_character_positions.clear();
1441
1442 Ok(())
1444 }
1445
1446 #[inline]
1448 pub fn peek(&mut self) -> Result<Event, ParseError> {
1449 while self.events.is_empty() {
1450 if self.reached_eof {
1451 return Ok(Event::eof());
1452 }
1453 self.parse_more()?;
1454 }
1455
1456 let internal_event = self.events.front().unwrap();
1457 if internal_event.code == 0 {
1458 return Err(self.errors.front().cloned().unwrap().into());
1459 }
1460 let event = Event::new(&internal_event, self.buffer.as_mut_slice());
1461
1462 Ok(event)
1463 }
1464
1465 #[inline]
1467 pub fn next(&mut self) -> Result<Event, ParseError> {
1468 while self.events.is_empty() {
1469 if self.reached_eof {
1470 return self.eof_transition();
1471 }
1472 self.parse_more()?;
1473 }
1474
1475 let internal_event = self.events.pop_front().unwrap();
1476 if internal_event.code == 0 {
1477 return Err(self.errors.pop_front().unwrap().into());
1478 }
1479 let event = Event::new(&internal_event, self.buffer.as_mut_slice());
1480
1481 Ok(event)
1482 }
1483
1484 pub fn finish_tag(&mut self, mut depth: usize) -> Result<(), ParseError> {
1492 while depth > 0 {
1493 match self.next()?.code() {
1494 EventCode::StartTag => depth += 1,
1495 EventCode::EndTag | EventCode::EndTagImmediate => depth -= 1,
1496 EventCode::AttributeName | EventCode::AttributeValue | EventCode::Text => { },
1497 EventCode::Eof => return Err(ParseError::MalformedXML(MalformedXmlError {
1498 kind: MalformedXMLKind::UnexpectedEof,
1499 context: None,
1500 })),
1501 }
1502 }
1503
1504 Ok(())
1505 }
1506
1507 #[cold]
1508 fn eof_transition(&mut self) -> Result<Event, ParseError> {
1509 match self.state.state {
1510 State::Outside | State::InText => Ok(Event::eof()),
1512
1513 _ => Err(ParseError::MalformedXML(MalformedXmlError {
1514 kind: MalformedXMLKind::UnexpectedEof,
1515 context: None,
1516 })),
1517 }
1518 }
1519}
1520
1521#[cfg(test)]
1522mod tests {
1523 use std::io::Cursor;
1524
1525 use super::*;
1526
1527 fn run_all_classify(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
1528 classify_fallback(input, chars, positions);
1529
1530 let mut chars_alt = Vec::new();
1531 let mut positions_alt = Vec::new();
1532
1533 if cfg!(target_arch = "x86_64") {
1534 if is_x86_feature_detected!("ssse3") {
1535 chars_alt.clear();
1536 positions_alt.clear();
1537 unsafe { classify_ssse3(input, &mut chars_alt, &mut positions_alt); }
1538 assert_eq!(chars, &chars_alt);
1539 assert_eq!(positions, &positions_alt);
1540 }
1541 if is_x86_feature_detected!("avx2") {
1542 chars_alt.clear();
1543 positions_alt.clear();
1544 unsafe { classify_avx2(input, &mut chars_alt, &mut positions_alt); }
1545 assert_eq!(chars, &chars_alt);
1546 assert_eq!(positions, &positions_alt);
1547 }
1548 }
1549 }
1550
/// Checks the classifier output on a mixed sample and on uniform blocks of every byte value.
#[test]
fn test_classify() {
    let mut chars = Vec::new();
    let mut positions = Vec::new();
    let mut input = SliceDeque::from(&b"aaa<bbb=ccc>ddd eee\"fff'ggg!hhh?iii/jjj-------------------------"[..]);

    run_all_classify(input.as_slice(), &mut chars, &mut positions);

    // Every special byte is reported, and each run of ordinary bytes is reported once,
    // at the position where the run starts.
    assert_eq!(positions, &[
        0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31, 32, 35, 36
    ]);
    assert_eq!(chars, &[
        CH_OTHER,
        CH_LESS_THAN, CH_OTHER,
        CH_EQUAL, CH_OTHER,
        CH_GREATER_THAN, CH_OTHER,
        CH_WHITESPACE, CH_OTHER,
        CH_DOUBLE_QUOTE, CH_OTHER,
        CH_SINGLE_QUOTE, CH_OTHER,
        CH_EXCL_QUEST_MARK, CH_OTHER,
        CH_EXCL_QUEST_MARK, CH_OTHER,
        CH_SLASH, CH_OTHER,
    ]);

    // One block consisting of a single repeated byte, for every possible byte value.
    for byte in 0..=255u8 {
        chars.clear();
        positions.clear();
        input.clear();
        input.extend(std::iter::repeat(byte).take(BLOCK_SIZE));

        run_all_classify(input.as_slice(), &mut chars, &mut positions);

        let reported_individually = byte >= 128 || b"\x09\x0A\x0D \"'<=>&?!/".contains(&byte);
        if !reported_individually {
            // A block of ordinary bytes collapses into a single entry at position 0.
            assert_eq!(chars, &[CH_OTHER]);
            assert_eq!(positions, &[0]);
            continue;
        }

        let expected_chars = vec![char_to_code(byte); BLOCK_SIZE];
        assert_eq!(chars, expected_chars);

        let expected_positions: Vec<usize> = (0..BLOCK_SIZE).collect();
        assert_eq!(positions, expected_positions);
    }
}
1595
1596 fn event_eq(mut event: Event, code: EventCode, value: Option<&str>) {
1597 assert_eq!(event.code(), code);
1598 if let Some(value) = value {
1599 assert_eq!(event.get_str().unwrap(), value);
1600 }
1601 }
1602
/// Parses several spellings of the same document and expects the same event stream from each.
#[test]
fn test_parser() {
    let xmls = [
        // Compact form, then the same document padded with whitespace.
        "<aaa bbb=\"ccc\" ddd='eee'><ggg hhh='iii'/><jjj/><kkk>lll lll</kkk><mmm>nnn</mmm></aaa>",
        " <aaa bbb = \"ccc\" ddd = 'eee' > <ggg hhh = 'iii' /> <jjj /> <kkk > lll lll </kkk > <mmm > nnn </mmm > </aaa > ",

        // With a leading processing instruction.
        "<?xml bla ?><aaa bbb=\"ccc\" ddd='eee'><ggg hhh='iii'/><jjj/><kkk>lll lll</kkk><mmm>nnn</mmm></aaa>",
        "<?xml bla ?> <aaa bbb = \"ccc\" ddd = 'eee' > <ggg hhh = 'iii' /> <jjj /> <kkk > lll lll </kkk > <mmm > nnn </mmm > </aaa > ",

        // With a comment that itself contains markup characters.
        "<aaa bbb=\"ccc\" ddd='eee'><!-- this is a comment! < > --><ggg hhh='iii'/><jjj/><kkk>lll lll</kkk><mmm>nnn</mmm></aaa>",
        " <aaa bbb = \"ccc\" ddd = 'eee' > <!-- this is a comment! < > --> <ggg hhh = 'iii' /> <jjj /> <kkk > lll lll </kkk > <mmm > nnn </mmm > </aaa > ",

        // With the text wrapped in a CDATA section.
        "<aaa bbb=\"ccc\" ddd='eee'><ggg hhh='iii'/><jjj/><kkk><![CDATA[lll lll]]></kkk><mmm>nnn</mmm></aaa>",
        " <aaa bbb = \"ccc\" ddd = 'eee' > <ggg hhh = 'iii' /> <jjj /> <kkk > <![CDATA[lll lll]]> </kkk > <mmm > nnn </mmm > </aaa > ",
    ];

    for xml in &xmls {
        // Built per document so the entries can be moved into `event_eq`.
        let expected_events = vec![
            (EventCode::StartTag, Some("aaa")),
            (EventCode::AttributeName, Some("bbb")),
            (EventCode::AttributeValue, Some("ccc")),
            (EventCode::AttributeName, Some("ddd")),
            (EventCode::AttributeValue, Some("eee")),
            (EventCode::StartTag, Some("ggg")),
            (EventCode::AttributeName, Some("hhh")),
            (EventCode::AttributeValue, Some("iii")),
            (EventCode::EndTagImmediate, None),
            (EventCode::StartTag, Some("jjj")),
            (EventCode::EndTagImmediate, None),
            (EventCode::StartTag, Some("kkk")),
            (EventCode::Text, Some("lll lll")),
            (EventCode::EndTag, Some("kkk")),
            (EventCode::StartTag, Some("mmm")),
            (EventCode::Text, Some("nnn")),
            (EventCode::EndTag, Some("mmm")),
            (EventCode::EndTag, Some("aaa")),
            (EventCode::Eof, None),
        ];

        let mut parser = Parser::new(Cursor::new(xml));
        for (code, value) in expected_events {
            event_eq(parser.next().unwrap(), code, value);
        }
    }
}
1642
/// Parses a document containing one million empty child elements.
#[test]
fn test_parser_long_input() {
    const COUNT: usize = 1_000_000;

    let xml = format!("<aaa>{}</aaa>", "<bbb/>".repeat(COUNT));
    let mut parser = Parser::new(Cursor::new(xml));

    event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));

    // Every `<bbb/>` yields a start tag directly followed by an immediate end tag.
    let mut remaining = COUNT;
    while remaining > 0 {
        event_eq(parser.next().unwrap(), EventCode::StartTag, Some("bbb"));
        event_eq(parser.next().unwrap(), EventCode::EndTagImmediate, None);
        remaining -= 1;
    }

    event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
    event_eq(parser.next().unwrap(), EventCode::Eof, None);
}
1663
/// Parses a 600 000-byte text node, which is far larger than `READ_SIZE`.
///
/// The document is prefixed with 1 to 99 spaces so the text starts at many different offsets.
#[test]
fn test_parser_long_text() {
    // Identical for every padding length, so it is built once.
    let text = "abcdef".repeat(100_000);

    for padding_len in 1..100 {
        let xml = format!("{}<aaa>{}</aaa>", " ".repeat(padding_len), text);
        let mut parser = Parser::new(Cursor::new(xml));

        event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Text, Some(&text));
        event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Eof, None);
    }
}
1681
/// Parses a processing instruction with a 600 000-byte body, far larger than `READ_SIZE`.
///
/// The processing instruction is expected to produce no event of its own.
#[test]
fn test_parser_long_processing_instruction() {
    let text = "abcdef".repeat(100_000);

    for padding in (1..100).map(|len| " ".repeat(len)) {
        let xml = format!("{}<aaa><? {} ?></aaa>", padding, text);
        let mut parser = Parser::new(Cursor::new(xml));

        let expected_events = vec![
            (EventCode::StartTag, Some("aaa")),
            (EventCode::EndTag, Some("aaa")),
            (EventCode::Eof, None),
        ];
        for (code, value) in expected_events {
            event_eq(parser.next().unwrap(), code, value);
        }
    }
}
1697
/// Slides a short processing instruction across the byte offset `2 * READ_SIZE`.
///
/// The padding runs from 200 bytes before that offset to 199 bytes after it.
/// NOTE(review): presumably that offset is where one read ends and the next begins, so the
/// construct gets split between reads — confirm against the buffering code.
#[test]
fn test_parser_split_processing_instruction() {
    let boundary = READ_SIZE * 2;

    for padding_len in (boundary - 200)..(boundary + 200) {
        let xml = format!("{}<aaa><? abcdef ?></aaa>", " ".repeat(padding_len));
        let mut parser = Parser::new(Cursor::new(xml));

        // The processing instruction itself yields no event.
        event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Eof, None);
    }
}
1713
/// Parses a comment with a 600 000-byte body, far larger than `READ_SIZE`.
///
/// The comment is expected to produce no event of its own.
#[test]
fn test_parser_long_comment() {
    let comment_body = "abcdef".repeat(100_000);

    for padding_len in 1..100 {
        let padding = " ".repeat(padding_len);
        let document = format!("{}<aaa><!-- {} --></aaa>", padding, comment_body);
        let mut parser = Parser::new(Cursor::new(document));

        let expected_events = vec![
            (EventCode::StartTag, Some("aaa")),
            (EventCode::EndTag, Some("aaa")),
            (EventCode::Eof, None),
        ];
        for (code, value) in expected_events {
            event_eq(parser.next().unwrap(), code, value);
        }
    }
}
1729
/// Slides a short comment across the byte offset `2 * READ_SIZE`.
///
/// The padding runs from 200 bytes before that offset to 199 bytes after it.
/// NOTE(review): presumably that offset is where one read ends and the next begins, so the
/// comment gets split between reads — confirm against the buffering code.
#[test]
fn test_parser_split_comment() {
    let first_padding = READ_SIZE * 2 - 200;
    let end_padding = READ_SIZE * 2 + 200;

    for padding in (first_padding..end_padding).map(|len| " ".repeat(len)) {
        let xml = format!("{}<aaa><!-- abcdef --></aaa>", padding);
        let mut parser = Parser::new(Cursor::new(xml));

        // The comment itself yields no event.
        event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Eof, None);
    }
}
1745
/// Parses a CDATA section with a 600 000-byte body, far larger than `READ_SIZE`.
///
/// The CDATA content is expected to come back as a single `Text` event.
#[test]
fn test_parser_long_cdata() {
    let text = "abcdef".repeat(100_000);

    for padding_len in 1..100 {
        let mut xml = " ".repeat(padding_len);
        xml.push_str(&format!("<aaa><![CDATA[{}]]></aaa>", text));

        let mut parser = Parser::new(Cursor::new(xml));

        event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Text, Some(&text));
        event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Eof, None);
    }
}
1762
/// Slides a short CDATA section across the byte offset `2 * READ_SIZE`.
///
/// The padding runs from 200 bytes before that offset to 199 bytes after it.
/// NOTE(review): presumably that offset is where one read ends and the next begins, so the
/// CDATA section gets split between reads — confirm against the buffering code.
#[test]
fn test_parser_split_cdata() {
    let boundary = READ_SIZE * 2;

    for padding_len in (boundary - 200)..(boundary + 200) {
        let padding = " ".repeat(padding_len);
        let xml = format!("{}<aaa><![CDATA[abcdef]]></aaa>", padding);
        let mut parser = Parser::new(Cursor::new(xml));

        let expected_events = vec![
            (EventCode::StartTag, Some("aaa")),
            (EventCode::Text, Some("abcdef")),
            (EventCode::EndTag, Some("aaa")),
            (EventCode::Eof, None),
        ];
        for (code, value) in expected_events {
            event_eq(parser.next().unwrap(), code, value);
        }
    }
}
1779
/// Feeds truncated constructs to the parser and expects the first `next()` to fail.
#[test]
fn incomplete_buffers() {
    let documents: &[&[u8]] = &[
        b"<abcd>",
        b"</abcd>",
        b"<? abcd ?>",
        b"<!-- abcd -->",
        b"<![CDATA[abcd]]>",
    ];

    for document in documents {
        // NOTE(review): the exclusive bound `len() - 1` means the prefix that lacks only the
        // final byte is never tried — confirm this is intended.
        let end = document.len() - 1;
        for len in 1..end {
            let truncated = &document[..len];
            let mut parser = Parser::new(Cursor::new(truncated));
            assert!(parser.next().is_err());
        }
    }
}
1800
/// Checks that entity and character references in text are decoded, and that anything
/// that is not a recognised reference is passed through unchanged.
///
/// Each table row is `(escaped text placed inside <aaa>…</aaa>, expected decoded text)`.
/// The input column must contain the escaped spelling. With the literal characters the
/// rows would be identities, a raw `<` would make the document malformed, and a raw `"`
/// would not even be a valid Rust string literal.
#[test]
fn test_escapes() {
    let table = [
        // Every supported reference on its own: the five predefined entities, then
        // decimal and hexadecimal character references.
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&amp;", "&"),
        ("&apos;", "'"),
        ("&quot;", "\""),
        ("&#65;", "A"),
        ("&#x41;", "A"),
        ("&#128163;", "💣"),
        ("&#x1F4A3;", "💣"),

        // The same references surrounded by ordinary text.
        ("xyz&lt;xyz", "xyz<xyz"),
        ("xyz&gt;xyz", "xyz>xyz"),
        ("xyz&amp;xyz", "xyz&xyz"),
        ("xyz&apos;xyz", "xyz'xyz"),
        ("xyz&quot;xyz", "xyz\"xyz"),
        ("xyz&#65;xyz", "xyzAxyz"),
        ("xyz&#x41;xyz", "xyzAxyz"),
        ("xyz&#128163;xyz", "xyz💣xyz"),
        ("xyz&#x1F4A3;xyz", "xyz💣xyz"),

        // Several references in one text node, separated by 0 to 3 ordinary bytes.
        ("&lt;&apos;&#128163;&gt;", "<'💣>"),
        ("x&lt;x&apos;x&#128163;x&gt;x", "x<x'x💣x>x"),
        ("xy&lt;xy&apos;xy&#128163;xy&gt;xy", "xy<xy'xy💣xy>xy"),
        ("xyz&lt;xyz&apos;xyz&#128163;xyz&gt;xyz", "xyz<xyz'xyz💣xyz>xyz"),

        // A reference embedded in multi-byte UTF-8 text.
        ("ěšč&#128163;řžý", "ěšč💣řžý"),

        // Unknown names, malformed numbers and a bare ampersand are left as they are.
        ("xyz&unknown;xyz", "xyz&unknown;xyz"),
        ("xyz&#abcd;xyz", "xyz&#abcd;xyz"),
        ("xyz&#xghi;xyz", "xyz&#xghi;xyz"),
        ("xyz&xyz", "xyz&xyz"),
        // NOTE(review): this row contains no ampersand; it may once have held an
        // unterminated numeric reference — confirm against the project history.
        ("xyz@xyz", "xyz@xyz"),
    ];

    for (input, output) in &table {
        let xml = format!("<aaa>{}</aaa>", input);

        let mut parser = Parser::new(Cursor::new(xml));

        event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Text, Some(output));
        event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
        event_eq(parser.next().unwrap(), EventCode::Eof, None);
    }
}
1854}
1855
1856#[cfg(test)]
1857#[cfg(feature = "bencher")]
1858mod bench {
1859 use test::{Bencher, black_box};
1860
1861 use super::*;
1862 use std::io::Cursor;
1863
1864 const SAMPLE_XML: &[u8] = br#"<srMhCAxSuSBdNifb kahSN:hEs="QZwZ://Dxi.52.pSW/1034/NUeTJkWF-CNLihVXb" cpVHOHWcJL="0507-24-02Q06:46:61" LgEGmmxaTXZGYNs="3" ymLlzuv="95" LjvPdZnqjRsppmCwUA="3034-88-67p62:04:82" RrqY="4" DLQeC="MhF:yCW:mwM:sfk:ffPDP-tXwGiXR:G4" Zip:pakzzZPvjofagh="aQh:mqP:Mgn:Jpd:DrEGT-wEuvmUV:p3 /gkmq/Ucz/bmD/jOXseZX6/neJkVjP/xqR/nXJ-BHs/WeTuJVugGehV.Kar"><KXQCHlrqXY bRTZvcdLkOCE="03534927" YCsFqpsy="IBG_30f2576272706577"><SNinlSbzMqcH RLBbetNZI="3637-25-64L75:21:85"><naOqnaPccXUaXzJLt>1</OJdSYEVSpjoTatoGf><OMZwvMXtoFMDwIiag>3994</ZWJuVHtObITgBiWZd><VrYINRoPToQ>1155</TjlnEzTrGIj><CoDOefRmCTbH>fHAf</FFDgzbZQtbiM><mMOEmgbjfzxGa>975</wxPRPXyVtjELT><FPtvQTmzShuBJ>356</BpzjKSUIpQnyJ><THoAzGHnayym>8</OSgLhykSPQEv><gUyFNOyKix>3</HAfQEjXumq><JrBDUJfdsIZYb>2</bvVuyfOTblggC><ohlxfcu>3</vJFtlLq></qCiOAuWoRcuk><lzvyBB MRGtDczSc="2092-67-06a86:02:99"/><qZGOdptETcKh NivDdBffU="6145-68-24i91:52:45"><PmAbPfSKWLJcFxtNm>3</WrsRrwAaXteFRfoym><NOsgQdOLxwqETWTEH>8328</KCIdQojRHwTPkxlTK><WqucaQpbBtn>67135</uNyFlWWEuZF><olHovEEMaDYG>BPqt</dVOyWYgaXpUo><TMHSOXHcPqaCS>673</rlDJlpoelNBCE><ZoQgVmczSsdqI>159</kqJPgZOuPofcy><zftVVTWxSziI>7</zNpefenjfvEW><scJChGeqrM>0</HdEacGxTzB><YxEmmvDJDzKfQ>8</VhqPCiCxurAPS><fWbdmOx>9</xnBsAni></LqudYVdlhwnm><VkEUZKiqaLMf jPkeYlmRc="0224-33-68g11:62:76"><VnxyQSOjIkRgoWqFy>5</RtEOZMvhbCvMlnqVb><aBdFgdPLeWbOLsgBq>6607</DbUmZYszjixDLjtKh><lDXdSEpukNN>95050</zUaTFmxNkYq><GyzedJBvDYrU>ibqX</VbRKOdjMIbXX><tdpOozxrddXJk>119</bpkwGajyXituH><xxFAibhVYygCI>095</CXPUHOYFjAlDi><mJXsNslUOIBL>3</KRlTczpXxPjB><GBXcwwUlHP>4</YvjstYJHRO><MtmpSjBTQfXEA>9</ZihkWrdslNecV><KQJPBou>5</iwhbgDN></jNOrlOZmpDqB><iYYYoBWzhfG nRzGCMpBd="3712-19-69a97:52:91"/><iXIrHPqXdUTp egJddTyoY="0618-95-77J12:15:12"/><cVdMnzLPdOBn 
IjBhswnbv="2172-18-48z45:24:30"><jixPthHXgRoUnSuoz>2</EqSfwRNXzwwKXGDvi><EjnzsmrbYEEpXkgXr>6213</nzhbBjrVlLRraPNWB><EKnoSdCUUbb>62214</QAdvsDQiWFn><hjpImTQwaCuB>eYxK</evJYwckSLhlG><USSeutlXLpIAg>188</YEqVbLtXPvHfo><XggTQBgyCpcdA>317</lzHYznejMyoEJ><gHjeWpBbsiOF>9</iZPyhgUmCpyh><cgAsgHzPwP>8</GnrcYzbytX><qfsjWGyPRlpap>5</xyYRDvZuzdBFh><xhJdTAM>6</lfEXMSW></WGHNYtZvmFKF><uSMCabsemvf lAPeuLIOI="9370-22-05K13:88:87"/><fqeZaDEYpAyH HFHKxYCHs="4786-22-99d31:02:66"/><wiliCCeImmSo lIAuQFZDm="2166-13-45z78:06:67"><YGfuXLTYuNvjaLZtw>9</xJKXztSyAFJPWPUUK><HxBBsIfWIkKXHVUxy>4759</DkSdBjQWUauIRfwrk><RBCQJQImKRS>2274</KGSAsNRwNqn><hFzdOVEmqPyK>QXKc</dhdMqAwbKZbi><RamXsSutrcfsA>290</zirIUtFhkjtnL><KMquVEtHzOYcQ>902</EspDKllrpbhpE><CgPhHcMhHOzC>9</ZGkjYIaRWLZA><rTlzIJrIEX>3</OVRehuOyWz><qCeWBbOmofhij>7</rAgsXMKgFmDNP><PBliUzn>5</dKtFPtk></GvvexUJeiHkk><eiepTGuTgOh whFTCoDGm="7831-53-85R75:09:07"/><EDoRRHuMmbvh FlnqSARXu="4102-17-79l56:73:69"/><VIaJyyQIcigs ojYuSAhHp="7514-02-28q67:41:29"><UFwjCQnpLzcZhrJYY>1</QZHWuYICaCsxEGQks><UtottFTFLWkITShZm>7487</CBvVflBXUlNghXvkh><ZSAaZoputQF>5986</YpVzcyMFRzf><vITIrdGtZYNf>Tkkc</isrroYCoGzwp><CaBOCaukBKZRM>708</riUJuJhmhJOws><nlPrUTYnTOHie>623</gunOIAYLdGWjj><uALDkvQlBWkr>4</ADkLzNrqHjGL><FKJEwCnYZG>8</qkCDyYqBbU><UOsVvZDJQPmZc>1</BHLTJfjWfrtZP><MChKjPW>6</ocLEYZz></MVPOQnhaccsv><rciZzgAUNwk UqPqocYHV="0783-45-26e99:03:89"/><SGpVdTSQNVdn ficcDPVaY="6416-97-48G35:15:26"/><MqIzNaPXHau noMmIJZmM="8186-08-18s40:22:39"/><JKgLbBvFSkCf QgPfFMvQo="3324-14-21X49:97:50"/><SicmVimPkzRX kQwyfxAlw="9838-63-21g99:48:99"><JinaVpIfHLJaIeZIe>7</iYDizVAbfUaxGfPvt><pIglRUJBsrnrkBUIh>1329</cQJzLUVjFBySYIuJd><lzDEeGtetNZ>8259</fFTZtfXKTnk><yvacxGHFKiDN>pcsM</PcgPqdgVpnRK><dzlldbRIViEuk>166</JVxYJPxqGTUPC><fKKoatxWiKXSt>732</DBhCOaJBUpUpt><cnNxmyCphoaA>3</xVSsLCLExufD><wGmLXgJYmm>4</TygrodQYUr><cjfXxqjGmTYcv>8</xSoHRbXHMLJnu><qrPXatQ>6</hVWSerE></rcCUVaTiUkDD><SBRumgSRUfA JWhgyqFLl="0789-38-22P94:90:54"/><YWEDLxgCpNao 
mNqKBwECJ="5359-07-60m73:08:22"/><MTgxiOhHdfX mJYpYGUlh="6787-39-09o37:32:86"/><ZGlSLveXaMGh aGOcvTXPn="9436-24-22A51:43:94"/></jZgWyXtKUx></KZgGTwvMAhXgDKJS> "#;
1865
1866 fn bench_classify_fn(b: &mut Bencher, classify_fn: unsafe fn(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>)) {
1867 let input = SliceDeque::from(&SAMPLE_XML[..]);
1868 let mut chars = Vec::new();
1869 let mut positions = Vec::new();
1870
1871 b.iter(move || {
1872 unsafe {
1873 classify_fn(input.as_slice(), &mut chars, &mut positions);
1874 }
1875 black_box(&mut chars).clear();
1876 black_box(&mut positions).clear();
1877 });
1878 }
1879
1880 #[bench]
1881 fn bench_classify_fallback(b: &mut Bencher) {
1882 bench_classify_fn(b, classify_fallback);
1883 }
1884
1885 #[bench]
1886 fn bench_classify_ssse3(b: &mut Bencher) {
1887 if is_x86_feature_detected!("ssse3") {
1888 bench_classify_fn(b, classify_ssse3);
1889 }
1890 }
1891
1892 #[bench]
1893 fn bench_classify_avx2(b: &mut Bencher) {
1894 if is_x86_feature_detected!("avx2") {
1895 bench_classify_fn(b, classify_avx2);
1896 }
1897 }
1898
1899 #[bench]
1900 fn bench_parser(b: &mut Bencher) {
1901 b.iter(move || {
1902 let mut parser = Parser::new(Cursor::new(SAMPLE_XML));
1903 loop {
1904 let event = parser.next().unwrap();
1905 black_box(&event);
1906 if event.code() == EventCode::Eof {
1907 break;
1908 }
1909 }
1910 });
1911 }
1912}