rapid_xml/
parser.rs

1//! Contains low-level XML `Parser`.
2//!
3//! For example XML like this:
4//! ```xml
5//! <tag attribute="value"/><another-tag>text</another-tag>
6//! ```
7//!
8//! Will give following events:
9//!
10//! | code            | text          |
11//! |-----------------|---------------|
12//! | StartTag        | "tag"         |
13//! | AttributeName   | "attribute"   |
14//! | AttributeValue  | "value"       |
15//! | EndTagImmediate |               |
16//! | StartTag        | "another-tag" |
17//! | Text            | "text"        |
18//! | EndTag          | "another-tag" |
19//! | Eof             |               |
20
21// The parsing is a pull-based, whenever the user calls `Parser::next()` and we haven't reached EOF
22// in the underlying `Read`er, the next event is acquired. However, the parsing is done in blocks,
23// keeping a buffer of events for next time. When all events are drained, next block is parsed.
24
25// The parsing of a block is done in these steps:
26// * Data are read from the underlying `Read`er into buffer provided by `SliceDeque`
27//   - The parsing code that follows may not always consume complete buffer, it may leave some
28//     remainder (e.g. the beginning of a tag) and require additional data before it can finish
29//     parsing it. So if we used simple `Vec` for buffer, we would have to either shift this
30//     remainder to its start or keep growing it, both are suboptimal. For optimal solution we need
31//     a ring buffer. `SliceDeque` provides us ring-buffer with continuous views, so we can read
32//     from it and write to it as if it was a slice.
33//   - `SliceDeque` also guarantees that the buffer will be aligned to a page, which is good for our
34//     SIMD classifiers.
35//
36// * The characters in the buffer are classified by their meaning to XML. This gives us list of
37//   their codes (see `CH_*` constants) and list of their positions in the buffer. We call them
38//   control characters (is in characters that control XML parsing, not ascii control characters).
39//   - The control characters are:
40//     * &"'?!/<=>
41//     * Whitespaces
42//     * Any character above 0x80 as marker of possible Utf-8
43//     * First character other than the above when following one of the above
44//   - Basic implementation of the classification is done by `classify_fallback`. But whenever
45//     possible a SIMD version of `classify_*` is used instead. The SIMD versions have exactly the
46//     same output as `classify_fallback`, but are lot faster. Read comments inside them to see how
47//     they work.
48//
49//  * A DFA is run using the character codes as input. It emits a list of `InternalEvent`s
50//    and possibly also errors. See `StateMachine::run`.
51//    - Each input character may contain flags that are ORed into flags for next event.
52//    - Each transition between states contains additional flags that control the emitting of
53//      events.
54//    - The DFA doesn't handle all possible XML constructs, just the most common. (This was done in
55//      order to keep the amount of states and input alphabet small for future SIMD implementation
56//      of the state machine.) If some uncommon XML construct or syntax error appears, an exception
57//      handler is invoked. The exception handler will inspect the buffer and either handle the
58//      uncommon XML construct, or reports error and attempts to recover from it, then it returns
59//      control back to the state machine.
60//
61//  * The `Parser::next()` picks the next `InternalEvent`s and converts it into `Event`s.
62//    - The depending on the kind of `Event`, it may hold a slice of the `Parser`'s internal buffer.
63//      (E.g. holding the name of a tag, value of attribute, text from in between tags, ...)
64//    - The `Event` has flags sets by the DFA on whether the slice contains any XML entities and any
65//      possible Utf-8 characters.
66//      * If there are any XML entities, they are lazily decoded in-place in the `Parser`'s internal
67//        buffer when the user asks for the contained string or byte-slice.
68//      * If there are any possible Utf-8 characters, the Utf-8 encoding is lazily validated when
69//        the user asks for the contained string.
70
71#[cfg(target_arch = "x86_64")]
72use std::arch::x86_64::*;
73use std::collections::VecDeque;
74use std::fmt::Display;
75use std::intrinsics::transmute;
76use std::io::Read;
77use std::ops::Range;
78use std::str::Utf8Error;
79
80use multiversion::multiversion;
81use num_enum::TryFromPrimitive;
82use slice_deque::SliceDeque;
83
84/// Amount of bytes that we attempt to read from underlying Reader
85const READ_SIZE: usize = 4 * 4096; // TODO: Fine-tune.
86
87/// We classify bytes in blocks of this size
88const BLOCK_SIZE: usize = 64; // Size of u64, 64 characters, 4 sse2 128i loads, 2 avx 256i loads.
89
90// Our custom codes for important XML characters / character groups
91const CH_OTHER: u8           = 0x00; // Must be 0 because of how the SIMD algorithm works.
92const CH_OTHER_UTF8: u8      = 0x80; // Same as CH_OTHER, just with extra flag
93const CH_OTHER_AMPERSAND: u8 = 0x40; // Same as CH_OTHER, just with extra flag
94const CH_DOUBLE_QUOTE: u8    = 0x01;
95const CH_SINGLE_QUOTE: u8    = 0x02;
96const CH_WHITESPACE: u8      = 0x03;
97const CH_EXCL_QUEST_MARK: u8 = 0x04;
98const CH_SLASH: u8           = 0x05;
99const CH_LESS_THAN: u8       = 0x06;
100const CH_EQUAL: u8           = 0x07;
101const CH_GREATER_THAN: u8    = 0x08;
102
103
104/// Naive mapping of ascii characters to our codes
105fn char_to_code(c: u8) -> u8 {
106    match c {
107        b'\t' | b'\n' | b'\r' | b' ' => CH_WHITESPACE,
108        b'!' | b'?' => CH_EXCL_QUEST_MARK,
109        b'\"' => CH_DOUBLE_QUOTE,
110        b'\'' => CH_SINGLE_QUOTE,
111        b'&' => CH_OTHER_AMPERSAND,
112        b'/' => CH_SLASH,
113        b'<' => CH_LESS_THAN,
114        b'=' => CH_EQUAL,
115        b'>' => CH_GREATER_THAN,
116        128..=255 => CH_OTHER_UTF8,
117        _ => CH_OTHER,
118    }
119}
120
121/// Fills the `chars` and `positions` vectors with codes and indexes of control characters.
122///
123/// If there was anything in the `chars` or `positions` vector, the new values are appended to it.
124fn classify_fallback(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
125    let mut prev = CH_LESS_THAN; // Any that is not CH_OTHER
126
127    for (i, c) in input.iter().enumerate() {
128        let ch = char_to_code(*c);
129
130        if ch != CH_OTHER || prev != CH_OTHER {
131            chars.push(ch);
132            positions.push(i);
133            prev = ch;
134        }
135    }
136}
137
138/// Same as `classify_fallback`, but implemented using SIMD intrinsics.
139///
140/// # Safety
141///
142/// Can be only called if SSSE3 is available.
143#[cfg(target_arch = "x86_64")]
144#[target_feature(enable = "ssse3")]
145unsafe fn classify_ssse3(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
146    // SIMD classification using two `pshufb` instructions (`_mm_shuffle_epi8` intrinsic) that match
147    // the high and low nibble of the byte. Combining that together we get a non-zero value for every
148    // of the matched characters.
149    //
150    // There are many ways to classify characters using SIMD. This page provides description of
151    // several: http://0x80.pl/articles/simd-byte-lookup.html
152    //
153    // We are using custom algorithm similar to the "Universal" one described on the page linked
154    // above. The key difference is that we not only need to get bytemask/bitmask marking the
155    // characters we are looking for, but also classify them into groups.
156    //
157    // The key to this is the `pshufb` instruction, which uses the bottom 4 bits of each byte in the
158    // SIMD register to index into another SIMD register and writes those to the output register.
159    // We can do this twice, once for the lower 4 bits and second times for the higher 4 bits.
160    //
161    // We can write ASCII table in 16 x 16 table, having the high 4 bits index columns and low 4
162    // bits index the rows. One `pshufb` instruction basically assigns a byte to a column and
163    // another to a row. We can strategically set bits in this byte such that when we AND them
164    // together we will have some bits set only in the characters we care about.
165    //
166    // This in general allows to classify up to 8 groups (8 bits in byte), but we manage to use it
167    // for 9 groups, because one group sits exactly in place where other two groups cross.
168    //
169    //     hi / lo
170    //                   groups                                      characters
171    //         +--------------------------------         +--------------------------------
172    //         | 0 1 2 3 4 5 6 7 8 9 a b c d e f         | 0 1 2 3 4 5 6 7 8 9 a b c d e f
173    //       --+--------------------------------       --+--------------------------------        legend:
174    //       0 | . . . . . . . . . A A . . A . .       0 | . . . . . . . . . S S . . S . .        S = whitespace
175    //       1 | . . . . . . . . . . . . . . . .       1 | . . . . . . . . . . . . . . . .        U = possible Utf-8 character
176    //       2 | B B B . . . C C . . . . . . . D       2 | S ! " . . . & ' . . . . . . . /
177    //       3 | . . . . . . . . . . . . E E E F       3 | . . . . . . . . . . . . < = > ?
178    //       4 | . . . . . . . . . . . . . . . .       4 | . . . . . . . . . . . . . . . .
179    //       5 | . . . . . . . . . . . . . . . .       5 | . . . . . . . . . . . . . . . .
180    //       6 | . . . . . . . . . . . . . . . .       6 | . . . . . . . . . . . . . . . .
181    //       7 | . . . . . . . . . . . . . . . .       7 | . . . . . . . . . . . . . . . .
182    //       8 | G G G G G G G G G G G G G G G G       8 | U U U U U U U U U U U U U U U U
183    //       9 | G G G G G G G G G G G G G G G G       9 | U U U U U U U U U U U U U U U U
184    //       a | G G G G G G G G G G G G G G G G       a | U U U U U U U U U U U U U U U U
185    //       b | G G G G G G G G G G G G G G G G       b | U U U U U U U U U U U U U U U U
186    //       c | G G G G G G G G G G G G G G G G       c | U U U U U U U U U U U U U U U U
187    //       d | G G G G G G G G G G G G G G G G       d | U U U U U U U U U U U U U U U U
188    //       e | G G G G G G G G G G G G G G G G       e | U U U U U U U U U U U U U U U U
189    //       f | G G G G G G G G G G G G G G G G       f | U U U U U U U U U U U U U U U U
190    //
191    // After that we perform a serie of byte-wise operations on the SIMD vector of characters, that
192    // put the values for all control characters in 0-15 range. Part of that is XOR of the
193    // character values with the group values. So the bits selected for each group matter!
194    //
195    // Then we extract a bitmask into u64, where each bit represents whether given position in block
196    // of characters contains a control character. We shift this and OR it with itself to also
197    // select every character following a control character.
198    //
199    // Then for every bit that is set we save the control character value and position to the
200    // provided vectors.
201
202    // Group constants must be a single bit each, they are carefully selected such that we can later
203    // use add their value to the character's ascii code to make the last 4 bits unique.
204    const NOTHING: i8 = 0;
205    const GROUP_A: i8 = 0x10;
206    const GROUP_B: i8 = 0x20;
207    const GROUP_C: i8 = 0x02;
208    const GROUP_D: i8 = 0x06; // We ran out of bits, but this one may use GROUP_C|GROUP_F, because it sits exactly in position where they cross.
209    const GROUP_E: i8 = 0x40;
210    const GROUP_F: i8 = 0x04;
211    const GROUP_G: i8 = 0x08;
212
213    // First we load some constants into registers. These are used repeatedly in the following loop
214    // and should stay in registers until the end.
215    let lo_nibbles_lookup = _mm_setr_epi8(
216        /* column 0 */ GROUP_G | GROUP_B,
217        /* column 1 */ GROUP_G | GROUP_B,
218        /* column 2 */ GROUP_G | GROUP_B,
219        /* column 3 */ GROUP_G,
220        /* column 4 */ GROUP_G,
221        /* column 5 */ GROUP_G,
222        /* column 6 */ GROUP_G | GROUP_C,
223        /* column 7 */ GROUP_G | GROUP_C,
224        /* column 8 */ GROUP_G,
225        /* column 9 */ GROUP_G | GROUP_A,
226        /* column a */ GROUP_G | GROUP_A,
227        /* column b */ GROUP_G,
228        /* column c */ GROUP_G | GROUP_E,
229        /* column d */ GROUP_G | GROUP_A | GROUP_E,
230        /* column e */ GROUP_G | GROUP_E,
231        /* column f */ GROUP_G | GROUP_D | GROUP_F,
232    );
233
234    let hi_nibbles_lookup = _mm_setr_epi8(
235        /* row 0 */ GROUP_A,
236        /* row 1 */ NOTHING,
237        /* row 2 */ GROUP_B | GROUP_C | GROUP_D,
238        /* row 3 */ GROUP_E | GROUP_F,
239        /* row 4 */ NOTHING,
240        /* row 5 */ NOTHING,
241        /* row 6 */ NOTHING,
242        /* row 7 */ NOTHING,
243        /* row 8 */ GROUP_G,
244        /* row 9 */ GROUP_G,
245        /* row a */ GROUP_G,
246        /* row b */ GROUP_G,
247        /* row c */ GROUP_G,
248        /* row d */ GROUP_G,
249        /* row e */ GROUP_G,
250        /* row f */ GROUP_G,
251    );
252
253    let vec_x00 = _mm_set1_epi8(0x00);
254    let vec_x0f = _mm_set1_epi8(0x0f);
255    let vec_x20 = _mm_set1_epi8(0x20);
256    let vec_x80 = _mm_set1_epi8(-128);
257
258    let compact_lookup = _mm_setr_epi8(
259        /* 0 */ CH_WHITESPACE as i8,
260        /* 1 */ CH_EXCL_QUEST_MARK as i8,
261        /* 2 */ CH_DOUBLE_QUOTE as i8,
262        /* 3 */ -1, // Should not be present
263        /* 4 */ CH_OTHER_AMPERSAND as i8,
264        /* 5 */ CH_SINGLE_QUOTE as i8,
265        /* 6 */ -1, // Should not be present
266        /* 7 */ -1, // Should not be present
267        /* 8 */ CH_OTHER_UTF8 as i8,
268        /* 9 */ CH_SLASH as i8,
269        /* a */ -1, // Should not be present
270        /* b */ CH_EXCL_QUEST_MARK as i8,
271        /* c */ CH_LESS_THAN as i8,
272        /* d */ CH_EQUAL as i8,
273        /* e */ CH_GREATER_THAN as i8,
274        /* f */ CH_OTHER as i8, // This doesn't actually matter because whitespaces have the highest bit set and that's why it is mapped to 0, not because of this.
275    );
276
277    // Get the `input` as raw pointers and assert that both start and end are aligned to `BLOCK_SIZE`
278    let mut ptr = input.as_ptr();
279    let end = input.as_ptr().add(input.len());
280    debug_assert!(ptr as usize % BLOCK_SIZE == 0);
281    debug_assert!(end as usize % BLOCK_SIZE == 0);
282
283    let mut offset = 0;
284
285    // This is our piece of memory on stack into which we can do aligned stores and then read back
286    // using non-simd code.
287    #[repr(align(64))]
288    struct ScratchPad([u8; BLOCK_SIZE]);
289    let mut scratchpad = ScratchPad([0; 64]);
290
291    // We don't want to keep state between runs of the classifier, so here we just assume that the
292    // previous block ended in a control character, so we should emit OTHER_CHAR if we don't begin
293    // with control character. If it is not true, we will have two OTHER_CHARs in a row, which is
294    // not a big deal, the state machine can handle that.
295    let mut prev_mask = 1u64 << 63;
296
297    // Here is the main loop. We iterate in BLOCK_SIZE blocks.
298    while ptr < end {
299        // This function handles one SIMD vector within the block. In here we are dealing with sse2,
300        // so SIMD vector is 16 bytes, while BLOCK_SIZE is 64 bytes. So this function will be called
301        // 4 times.
302        let load_vec = |ptr: *const u8, out: *mut u8| -> u64 {
303            // Load input from memory to SIMD register
304            //
305            //   Example: input = ['A', '\t', '\n', '\r', ' ', '!', '"', '\'', '/', '<', '=', '>', '?', '\xea', ..]
306            //    in hex: input = [41, 09, 0a, 0d, 20, 21, 22, 26, 27, 2f, 3c, 3d, 3e, 3f, ea, ..]
307            let input = _mm_load_si128(ptr as *const __m128i); // TODO: Stream load could be good, but it is not available in rust. :-(
308
309            // Map characters into groups using two shuffles
310            //
311            //   Example: groups = [00, 10, 10, 10, 20, 20, 20, 02, 02, 06, 40, 40, 40, 04, 08, ...]
312            let lo_nibbles = _mm_and_si128(input, vec_x0f);
313            let hi_nibbles = _mm_and_si128(_mm_srli_epi16(input, 4), vec_x0f);
314            let lo_translated = _mm_shuffle_epi8(lo_nibbles_lookup, lo_nibbles);
315            let hi_translated = _mm_shuffle_epi8(hi_nibbles_lookup, hi_nibbles);
316            let groups = _mm_and_si128(lo_translated, hi_translated);
317
318            // Test for zeroes (i.e. the "other" characters)
319            //
320            //   Example: mask = [ff, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, 00, ...]
321            let mask = _mm_cmpeq_epi8(groups, vec_x00);
322
323            // Squash all characters above 0x80 to 0x80 (Utf-8 characters)
324            //
325            //   Example: input = [41, 09, 0a, 0d, 20, 21, 22, 26, 27, 2f, 3c, 3d, 3e, 3f, 80, ..]
326            let input = _mm_min_epu8(input, vec_x80);
327
328            // Turn all non-matched characters to 0xff
329            //
330            //   Example: input = [ff, 09, 0a, 0d, 20, 21, 22, 26, 27, 2f, 3c, 3d, 3e, 3f, 80, ..]
331            let input = _mm_or_si128(input, mask);
332
333            // Squash all characters <= 0x20 to 0 (now whitespace characters only) using saturating
334            // subtraction.
335            //
336            //   Example: input = [df, 00, 00, 00, 00, 01, 02, 06, 07, 0f, 1c, 1d, 1e, 1f, 60, ..]
337            let input = _mm_subs_epu8(input, vec_x20);
338
339            // Xor groups into the input. Thanks to carefully selected group numbers, this will
340            // make sure that every character of interest gets unique last 4 bits.
341            //
342            //   Example: input = [df, 10, 10, 10, 20, 21, 22, 04, 05, 09, 5c, 5d, 5e, 1b, 68, ..]
343            //     Only last 4 bits really matter, unless the highest bit is set
344            //                    [df, _0, _0, _0, _0, _1, _2, _4, _5, _9, _c, _d, _e, _b, _8, ..]
345            let input = _mm_xor_si128(input, groups);
346
347            // Now we have unique last 4 bits and the highest bit is not set for any but CH_OTHER.
348            // We use another shuffle to map these values (which are all over the place in the 0..F
349            // range) into a compacted list.
350            // Note that we even have distinct value for '!' and '?', but we map them into the same
351            // value here to save space in the DFA.
352            //
353            //   Example: input = [00, 01, 01, 01, 01, 02, 03, 04, 06, 07, 08, 09, 02, 0a, ..]
354            let input = _mm_shuffle_epi8(compact_lookup, input);
355
356            // Store it to our properly aligned scratchpad
357            _mm_store_si128(out as *mut __m128i, input);
358
359            // Return the mask as bit mask
360            (_mm_movemask_epi8(mask) as u16) as u64
361        };
362
363        // We load the block as 4x SSE2 vector and combine the bits of the mask into one u64 mask
364        let mask_a = load_vec(ptr,         &mut scratchpad.0[0]  as *mut u8);
365        let mask_b = load_vec(ptr.add(16), &mut scratchpad.0[16] as *mut u8);
366        let mask_c = load_vec(ptr.add(32), &mut scratchpad.0[32] as *mut u8);
367        let mask_d = load_vec(ptr.add(48), &mut scratchpad.0[48] as *mut u8);
368
369        let mut mask = !(mask_a | (mask_b << 16) | (mask_c << 32) | (mask_d << 48));
370
371        // Shift the mask such that we take one non-control character after each block of control
372        // characters. Remember the mask for next iteration.
373        let tmp = mask;
374        mask = mask | (mask << 1) | (prev_mask >> 63);
375        prev_mask = tmp;
376
377        // If we found any control characters in this block
378        if mask != 0 {
379            let count = mask.count_ones() as usize;
380
381            // Reserve enough space in case the whole block was filled with control characters
382            chars.reserve(BLOCK_SIZE); // Max amount of control characters we may find.
383            positions.reserve(BLOCK_SIZE); // Max amount of control characters we may find.
384            let mut chars_ptr = chars.as_mut_ptr().add(chars.len());
385            let mut positions_ptr = positions.as_mut_ptr().add(positions.len());
386
387            // Write them out.
388            'outer: loop {
389                // To reduce the amount of conditional jumps, we always write out the next 4
390                // characters, even if there is less than 4 characters left. In that case, some of
391                // them are incorrect, but they will be ignored because at the end we set the length
392                // to the actual total amount of found characters.
393
394                // TODO: Verify that this is unrolled
395                // TODO: Fine-tune the amount of repetitions
396                for _ in 0..4 {
397                    let index = mask.trailing_zeros() as usize;
398
399                    let ch = *scratchpad.0.get_unchecked(index); // Safety: u64 has max 63 trailing zeroes, so that will never overflow
400                    let pos = index + offset;
401
402                    *chars_ptr = ch;
403                    chars_ptr = chars_ptr.add(1);
404                    *positions_ptr = pos;
405                    positions_ptr = positions_ptr.add(1);
406
407                    mask = mask & mask.wrapping_sub(1);
408                    if mask == 0 {
409                        break 'outer;
410                    }
411                }
412            }
413
414            chars.set_len(chars.len() + count);
415            positions.set_len(positions.len() + count);
416        }
417
418        ptr = ptr.add(BLOCK_SIZE);
419        offset += BLOCK_SIZE;
420    }
421}
422
423/// Same as `classify_ssse3`, but using AVX2 instructions.
424///
425/// # Safety
426///
427/// Can be only called if AVX2 is available.
428#[cfg(target_arch = "x86_64")]
429#[target_feature(enable = "avx2")]
430unsafe fn classify_avx2(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
431    // Same algorithm as classify_ssse3, but working on half the amount of twice as long vectors.
432
433    const NOTHING: i8 = 0;
434    const GROUP_A: i8 = 0x10;
435    const GROUP_B: i8 = 0x20;
436    const GROUP_C: i8 = 0x02;
437    const GROUP_D: i8 = 0x06;
438    const GROUP_E: i8 = 0x40;
439    const GROUP_F: i8 = 0x04;
440    const GROUP_G: i8 = 0x08;
441
442    let lo_nibbles_lookup = _mm256_setr_epi8(
443        /* 0 */ GROUP_G | GROUP_B,
444        /* 1 */ GROUP_G | GROUP_B,
445        /* 2 */ GROUP_G | GROUP_B,
446        /* 3 */ GROUP_G,
447        /* 4 */ GROUP_G,
448        /* 5 */ GROUP_G,
449        /* 6 */ GROUP_G | GROUP_C,
450        /* 7 */ GROUP_G | GROUP_C,
451        /* 8 */ GROUP_G,
452        /* 9 */ GROUP_G | GROUP_A,
453        /* a */ GROUP_G | GROUP_A,
454        /* b */ GROUP_G,
455        /* c */ GROUP_G | GROUP_E,
456        /* d */ GROUP_G | GROUP_A | GROUP_E,
457        /* e */ GROUP_G | GROUP_E,
458        /* f */ GROUP_G | GROUP_D | GROUP_F,
459
460        /* 0 */ GROUP_G | GROUP_B,
461        /* 1 */ GROUP_G | GROUP_B,
462        /* 2 */ GROUP_G | GROUP_B,
463        /* 3 */ GROUP_G,
464        /* 4 */ GROUP_G,
465        /* 5 */ GROUP_G,
466        /* 6 */ GROUP_G | GROUP_C,
467        /* 7 */ GROUP_G | GROUP_C,
468        /* 8 */ GROUP_G,
469        /* 9 */ GROUP_G | GROUP_A,
470        /* a */ GROUP_G | GROUP_A,
471        /* b */ GROUP_G,
472        /* c */ GROUP_G | GROUP_E,
473        /* d */ GROUP_G | GROUP_A | GROUP_E,
474        /* e */ GROUP_G | GROUP_E,
475        /* f */ GROUP_G | GROUP_D | GROUP_F,
476    );
477
478    let hi_nibbles_lookup = _mm256_setr_epi8(
479        /* 0 */ GROUP_A,
480        /* 1 */ NOTHING,
481        /* 2 */ GROUP_B | GROUP_C | GROUP_D,
482        /* 3 */ GROUP_E | GROUP_F,
483        /* 4 */ NOTHING,
484        /* 5 */ NOTHING,
485        /* 6 */ NOTHING,
486        /* 7 */ NOTHING,
487        /* 8 */ GROUP_G,
488        /* 9 */ GROUP_G,
489        /* a */ GROUP_G,
490        /* b */ GROUP_G,
491        /* c */ GROUP_G,
492        /* d */ GROUP_G,
493        /* e */ GROUP_G,
494        /* f */ GROUP_G,
495
496        /* 0 */ GROUP_A,
497        /* 1 */ NOTHING,
498        /* 2 */ GROUP_B | GROUP_C | GROUP_D,
499        /* 3 */ GROUP_E | GROUP_F,
500        /* 4 */ NOTHING,
501        /* 5 */ NOTHING,
502        /* 6 */ NOTHING,
503        /* 7 */ NOTHING,
504        /* 8 */ GROUP_G,
505        /* 9 */ GROUP_G,
506        /* a */ GROUP_G,
507        /* b */ GROUP_G,
508        /* c */ GROUP_G,
509        /* d */ GROUP_G,
510        /* e */ GROUP_G,
511        /* f */ GROUP_G,
512    );
513
514    let vec_x00 = _mm256_set1_epi8(0x00);
515    let vec_x0f = _mm256_set1_epi8(0x0f);
516    let vec_x20 = _mm256_set1_epi8(0x20);
517    let vec_x80 = _mm256_set1_epi8(-128);
518
519    let compact_lookup = _mm256_setr_epi8(
520        /* 0 */ CH_WHITESPACE as i8,
521        /* 1 */ CH_EXCL_QUEST_MARK as i8,
522        /* 2 */ CH_DOUBLE_QUOTE as i8,
523        /* 3 */ -1, // Should not be present
524        /* 4 */ CH_OTHER_AMPERSAND as i8,
525        /* 5 */ CH_SINGLE_QUOTE as i8,
526        /* 6 */ -1, // Should not be present
527        /* 7 */ -1, // Should not be present
528        /* 8 */ CH_OTHER_UTF8 as i8,
529        /* 9 */ CH_SLASH as i8,
530        /* a */ -1, // Should not be present
531        /* b */ CH_EXCL_QUEST_MARK as i8,
532        /* c */ CH_LESS_THAN as i8,
533        /* d */ CH_EQUAL as i8,
534        /* e */ CH_GREATER_THAN as i8,
535        /* f */ CH_OTHER as i8,
536
537        /* 0 */ CH_WHITESPACE as i8,
538        /* 1 */ CH_EXCL_QUEST_MARK as i8,
539        /* 2 */ CH_DOUBLE_QUOTE as i8,
540        /* 3 */ -1, // Should not be present
541        /* 4 */ CH_OTHER_AMPERSAND as i8,
542        /* 5 */ CH_SINGLE_QUOTE as i8,
543        /* 6 */ -1, // Should not be present
544        /* 7 */ -1, // Should not be present
545        /* 8 */ CH_OTHER_UTF8 as i8,
546        /* 9 */ CH_SLASH as i8,
547        /* a */ -1, // Should not be present
548        /* b */ CH_EXCL_QUEST_MARK as i8,
549        /* c */ CH_LESS_THAN as i8,
550        /* d */ CH_EQUAL as i8,
551        /* e */ CH_GREATER_THAN as i8,
552        /* f */ CH_OTHER as i8,
553    );
554
555    let mut ptr = input.as_ptr();
556    let end = input.as_ptr().add(input.len());
557    debug_assert!(ptr as usize % BLOCK_SIZE == 0);
558    debug_assert!(end as usize % BLOCK_SIZE == 0);
559
560    let mut offset = 0;
561
562    #[repr(align(64))]
563    struct ScratchPad([u8; BLOCK_SIZE]);
564    let mut scratchpad = ScratchPad([0; 64]);
565
566    let mut prev_mask = 1u64 << 63;
567
568    while ptr < end {
569        let mask_a = {
570            let input = _mm256_load_si256(ptr as *const __m256i); // TODO: Stream load could be good, but it is not available in rust. :-(
571
572            let lo_nibbles = _mm256_and_si256(input, vec_x0f);
573            let hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(input, 4), vec_x0f);
574            let lo_translated = _mm256_shuffle_epi8(lo_nibbles_lookup, lo_nibbles);
575            let hi_translated = _mm256_shuffle_epi8(hi_nibbles_lookup, hi_nibbles);
576            let groups = _mm256_and_si256(lo_translated, hi_translated);
577
578            let mask = _mm256_cmpeq_epi8(groups, vec_x00);
579
580            let input = _mm256_min_epu8(input, vec_x80);
581            let input = _mm256_or_si256(input, mask);
582            let input = _mm256_subs_epu8(input, vec_x20);
583            let input = _mm256_xor_si256(input, groups);
584            let input = _mm256_shuffle_epi8(compact_lookup, input);
585
586            _mm256_store_si256(&mut scratchpad.0[0] as *mut u8 as *mut __m256i, input);
587
588            (_mm256_movemask_epi8(mask) as u32) as u64
589        };
590
591        let mask_b = {
592            let input = _mm256_load_si256(ptr.add(32) as *const __m256i); // TODO: Stream load could be good, but it is not available in rust. :-(
593
594            let lo_nibbles = _mm256_and_si256(input, vec_x0f);
595            let hi_nibbles = _mm256_and_si256(_mm256_srli_epi16(input, 4), vec_x0f);
596            let lo_translated = _mm256_shuffle_epi8(lo_nibbles_lookup, lo_nibbles);
597            let hi_translated = _mm256_shuffle_epi8(hi_nibbles_lookup, hi_nibbles);
598            let groups = _mm256_and_si256(lo_translated, hi_translated);
599
600            let mask = _mm256_cmpeq_epi8(groups, vec_x00);
601
602            let input = _mm256_min_epu8(input, vec_x80);
603            let input = _mm256_or_si256(input, mask);
604            let input = _mm256_subs_epu8(input, vec_x20);
605            let input = _mm256_xor_si256(input, groups);
606            let input = _mm256_shuffle_epi8(compact_lookup, input);
607
608            _mm256_store_si256(&mut scratchpad.0[32] as *mut u8 as *mut __m256i, input);
609
610            (_mm256_movemask_epi8(mask) as u32) as u64
611        };
612
613        let mut mask = !(mask_a | (mask_b << 32));
614
615        let tmp = mask;
616        mask = mask | (mask << 1) | (prev_mask >> 63);
617        prev_mask = tmp;
618
619        if mask != 0 {
620            let count = mask.count_ones() as usize;
621
622            chars.reserve(BLOCK_SIZE); // Max amount of control characters we may find.
623            positions.reserve(BLOCK_SIZE); // Max amount of control characters we may find.
624            let mut chars_ptr = chars.as_mut_ptr().add(chars.len());
625            let mut positions_ptr = positions.as_mut_ptr().add(positions.len());
626
627            'outer: loop {
628                // TODO: Verify that this is unrolled
629                // TODO: Fine-tune the amount of repetitions
630                for _ in 0..4 {
631                    let index = mask.trailing_zeros() as usize;
632
633                    let ch = *scratchpad.0.get_unchecked(index); // Safety: u64 has max 63 trailing zeroes, so that will never overflow
634                    let pos = index + offset;
635
636                    *chars_ptr = ch;
637                    chars_ptr = chars_ptr.add(1);
638                    *positions_ptr = pos;
639                    positions_ptr = positions_ptr.add(1);
640
641                    mask = mask & mask.wrapping_sub(1);
642                    if mask == 0 {
643                        break 'outer;
644                    }
645                }
646            }
647
648            chars.set_len(chars.len() + count);
649            positions.set_len(positions.len() + count);
650        }
651
652        ptr = ptr.add(BLOCK_SIZE);
653        offset += BLOCK_SIZE;
654    }
655}
656
657#[multiversion]
658#[specialize(target = "[x86|x86_64]+avx2", fn = "classify_avx2", unsafe = true)]
659#[specialize(target = "[x86|x86_64]+ssse3", fn = "classify_ssse3", unsafe = true)]
660// TODO: AVX512 should be possible, even with some additional improvements using scather instructions,
661//       but AVX512 is only partially implemented in nightly and not at all in stable rust.
662fn classify(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
663    classify_fallback(input, chars, positions)
664}
665
666/// If this bit is set in a transition, it means that event should be emitted
667const BIT_EMIT: u8 = 0b1000_0000;
668
669// If this bit is set in a transition, it means the current position should be saved as
670// the start of future event (if any).
671const BIT_SAVE_START: u8 = 0b0100_0000;
672
673// If this bit is set in a transition, it means the current position should be saved as
674// the end of future event (if any).
675const BIT_SAVE_END: u8 = 0b0010_0000;
676
677// This table is generated by the `build_dfa` binary. It maps from current state and control
678// character into new state and flags.
679const PARSE_DFA: [u8; 19 * 9] = [
680/*             00    01    02    03    04    05    06    07    08 */
681/*   00:  */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
682/*   01:  */ 0x01, 0x00, 0x00, 0xaa, 0x00, 0xa3, 0x00, 0x00, 0xa7,
683/*   02:  */ 0x02, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, 0x00, 0xa7,
684/*   03:  */ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x87,
685/*   04:  */ 0x04, 0x04, 0x04, 0x2c, 0x04, 0x04, 0xa8, 0x04, 0x04,
686/*   05:  */ 0x05, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0xaf, 0x00,
687/*   06:  */ 0x06, 0xb2, 0x06, 0x06, 0x06, 0x06, 0x00, 0x06, 0x06,
688/*   07:  */ 0x44, 0x44, 0x44, 0x07, 0x44, 0x44, 0x08, 0x44, 0x44,
689/*   08:  */ 0x41, 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,
690/*   09:  */ 0x42, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
691/*   0a:  */ 0x45, 0x00, 0x00, 0x0a, 0x00, 0x03, 0x00, 0x00, 0x07,
692/*   0b:  */ 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x00, 0x07,
693/*   0c:  */ 0x04, 0x04, 0x04, 0x0c, 0x04, 0x04, 0x88, 0x04, 0x04,
694/*   0d:  */ 0x00, 0x00, 0x00, 0x0d, 0x00, 0x00, 0x00, 0x0f, 0x00,
695/*   0e:  */ 0x0e, 0x0e, 0xb2, 0x0e, 0x0e, 0x0e, 0x00, 0x0e, 0x0e,
696/*   0f:  */ 0x00, 0x10, 0x11, 0x0f, 0x00, 0x00, 0x00, 0x00, 0x00,
697/*   10:  */ 0x46, 0x00, 0x46, 0x46, 0x46, 0x46, 0x00, 0x46, 0x46,
698/*   11:  */ 0x4e, 0x4e, 0x00, 0x4e, 0x4e, 0x4e, 0x00, 0x4e, 0x4e,
699/*   12:  */ 0x00, 0x00, 0x00, 0x0a, 0x00, 0x03, 0x00, 0x00, 0x07,
700];
701
702// If this bit is set in event code, the string of the event contains UTF-8 characters. (Actually
703// non-ascii characters that must be validated as valid UTF-8.)
704// The same bit is set in the code of the character that represents non-ascii character.
705const BIT_HAS_UTF8: u8 =    0b1000_0000;
706
707// If this bit is set in event code, the string of the event contains XML escapes.
708// The same bit is set in the code of ampersand.
709const BIT_HAS_ESCAPES: u8 = 0b0100_0000;
710
711/// Represents the type of the `Event`.
712#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
713#[repr(u8)]
714pub enum EventCode {
715    // 0 is never reported as enum EventCode, but it is used in u8 representation to report error.
716
717    /// Start of an opening tag which may or may not have attributes and may be immediately closed.
718    /// Corresponds to the `<abcd` part. The contained string is `abcd`.
719    StartTag =        0o1,
720
721    /// Closing tag. Corresponds to the `</abcd>` part. The contained string is `abcd`.
722    EndTag =          0o2,
723
724    /// Immediately closed tag. Corresponds to the `/>` part. Never contains any string.
725    EndTagImmediate = 0o3,
726
727    /// Text between tags.
728    Text =            0o4,
729
730    /// Name of an attribute - the part before `=`. In well-formed XML, it is always followed by
731    /// `AttributeValue`. The contained string is the attribute name.
732    AttributeName =   0o5,
733
734    /// Value of an attribute - the part after `=`. In well-formed XML, it is always preceeded by
735    /// `AttributeName`. The contained string is the attribute value.
736    AttributeValue =  0o6,
737
738    /// End of file. Never contains any string.
739    Eof =             0o7,
740}
741
742// States are chosen in such way, that the lower 3 bits contain the code of the event that would be
743// emitted in that state, if that state can ever emit an event.
744#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord, TryFromPrimitive)]
745#[repr(u8)]
746enum State {
747    /// Exception is a state that is not handled by DFA. It signifies either error in XML syntax, or
748    /// a XML construct that must be handled by specially and is not covered by the DFA.
749    Exception = 0,
750
751    /// State outside of any tag or text. This is the starting state.
752    Outside = EventCode::Eof as u8, // emits EventCode::Eof
753
754    TagStart = 0o10,
755    TagEnd = 0o11,
756
757    TagName = EventCode::StartTag as u8, // emits EventCode::StartTag
758    TagEndName = EventCode::EndTag as u8, // emits EventCode::EndTag
759
760    InTag = 0o12,
761    InTagEnd = 0o13,
762
763    AttrName = EventCode::AttributeName as u8, // emits EventCode::AttributeName
764
765    AfterAttrName = 0o14, // Space between attribute name and '=' sign
766
767    AfterAttrEq = 0o15,
768
769    AttrValueDoubleQuotedOpen = 0o21,
770    AttrValueDoubleQuoted = EventCode::AttributeValue as u8, // emits EventCode::AttributeValue
771
772    AttrValueSingleQuotedOpen = 0o20,
773    AttrValueSingleQuoted = EventCode::AttributeValue as u8 | 0o10, // emits EventCode::AttributeValue
774
775    AfterAttrValue = 0o17,
776
777    AfterImmediateEndTag = EventCode::EndTagImmediate as u8, // emits EventCode::EndTagImmediate
778
779    InText = EventCode::Text as u8, // emits EventCode::Text
780    InTextEndWhitespace = 0o22,
781
782    // Exception states. These states are set by exception handler, if the buffer did not contain
783    // enough data to handle the exception. (E.g. comment started in current buffer but didn't end
784    // in it.)
785
786    HandledException = 0o100,
787    InProcessingInstruction = 0o101,
788    InCommentOrCDATA = 0o102,
789    InComment = 0o103,
790    InCDATA = 0o104,
791}
792
793#[derive(Clone, Debug)]
794struct ParserState {
795    state: State,
796    flags: u8,
797    start_position: usize,
798    end_position: usize,
799    exception_up_to: usize,
800}
801
802impl Default for ParserState {
803    fn default() -> Self {
804        Self {
805            state: State::Outside,
806            flags: 0,
807            start_position: 0,
808            end_position: 0,
809            exception_up_to: 0,
810        }
811    }
812}
813
814#[derive(Debug)]
815struct StateMachine<'e, 'state> {
816    events: &'e mut VecDeque<InternalEvent>,
817    errors: &'e mut VecDeque<MalformedXmlError>,
818    state: &'state mut ParserState,
819    position_offset: usize,
820}
821
822impl<'e, 'state> StateMachine<'e, 'state> {
823    fn run(&mut self, chars: &[u8], positions: &[usize], buffer: &[u8]) {
824        // If we were inside exception handler when we ended last time, we must continue with it.
825        if self.state.state as u8 > State::HandledException as u8 {
826            self.continue_exception(buffer);
827
828            if self.state.state as u8 > State::HandledException as u8 {
829                // If the exception handler didn't change the state, we need more data in the buffer.
830                return;
831            }
832        }
833
834        let mut s = (*self.state).clone();
835
836        // Go over all control characters and their positions
837        for (ch, pos) in chars.iter().copied().zip(positions.iter().copied()) {
838            let out_pos = pos + self.position_offset;
839
840            if out_pos < s.exception_up_to {
841                // We are still receiving characters that are in area that was handled by exception
842                // handler.
843                continue;
844            }
845
846            // First we OR-in the flags from the characters (e.g. has-utf8, has-escapes).
847            // These are always additive. The lower bits from the characters get ORed too, it doesn't
848            // matter.
849            s.flags |= ch;
850
851            let dfa_index = s.state as u8 * 9 + (ch & 0b1111);
852            let transition = unsafe { *PARSE_DFA.get_unchecked(dfa_index as usize) };
853
854            if transition == State::Exception as u8 {
855                *self.state = s;
856                self.handle_exception(out_pos, buffer);
857                s = (*self.state).clone();
858
859                if s.state as u8 > State::HandledException as u8 {
860                    // If we are still in exception state after running the exception handler, then
861                    // we need more data in the buffer to run the handler.
862                    return;
863                } else {
864                    continue;
865                }
866            }
867
868            if transition & BIT_SAVE_END != 0 {
869                s.end_position = out_pos;
870            }
871
872            if transition & BIT_EMIT != 0 {
873                debug_assert!(s.state as u8 & 0b1100_0000 == 0, "The highest two bits in State are set now? We must mask them away before ORing with flags.");
874                let code = s.state as u8 | (s.flags & 0b1100_0000);
875                self.events.push_back(InternalEvent {
876                    start: s.start_position,
877                    end: s.end_position,
878                    code,
879                });
880                s.flags = 0;
881            }
882
883            if transition & BIT_SAVE_START != 0 {
884                s.start_position = out_pos;
885            }
886
887            s.state = match transition & 0b0001_1111 {
888                #[cfg(debug_assertions)]
889                state_num => State::try_from(state_num).unwrap(),
890
891                #[cfg(not(debug_assertions))]
892                state_num => unsafe { std::mem::transmute::<u8, State>(state_num) },
893            };
894        }
895
896        *self.state = s;
897    }
898
899    // The DFA covers only the most usual states in the XML state machine. Anything else will fall
900    // into exception state from which we have to manually recover. Exception may be either error
901    // in XML syntax or some less usual XML construct.
902    #[cold]
903    fn handle_exception(&mut self, pos: usize, buffer: &[u8]) {
904        match (self.state.state, buffer[pos]) {
905            (State::TagStart, b'?') =>
906                self.handle_processing_instruction(pos, buffer),
907
908            (State::TagStart, b'!') =>
909                // Note that it looks like we could inspect the `buffer` to see whether this is
910                // a comment or CDATA and in most cases it is true, but not always... If the "<!"
911                // is right at the end of the buffer, we don't know. The recognition of comment vs
912                // CDATA must happen in handler that can be restarted once more data arrive.
913                self.handle_comment_or_cdata(pos, buffer),
914
915            _ =>
916                self.handle_error(pos, buffer),
917        }
918    }
919
920    #[cold]
921    fn continue_exception(&mut self, buffer: &[u8]) {
922        match self.state.state {
923            State::InProcessingInstruction =>
924                self.continue_processing_instruction(buffer),
925
926            State::InCommentOrCDATA =>
927                self.continue_comment_or_cdata(buffer),
928
929            State::InComment =>
930                self.continue_comment(buffer),
931
932            State::InCDATA =>
933                self.continue_cdata(buffer),
934
935            _ =>
936                unreachable!("continue_exception was called with state {:?}", self.state.state),
937        }
938    }
939
940    fn handle_processing_instruction(&mut self, pos: usize, buffer: &[u8]) {
941        self.state.start_position = pos + 1;
942        self.state.state = State::InProcessingInstruction;
943
944        self.continue_processing_instruction(buffer);
945    }
946
947    fn continue_processing_instruction(&mut self, buffer: &[u8]) {
948        // Currently we ignore the content and just skip at its end.
949        // TODO: Should we handle them anyhow? We could at least emit their content as event...
950
951        debug_assert_eq!(self.state.state, State::InProcessingInstruction);
952
953        let pi_start = self.state.start_position;
954        let pi_end = twoway::find_bytes(&buffer[pi_start..], b"?>");
955        if let Some(pi_end) = pi_end {
956            let pi_end = pi_end + pi_start;
957            self.state.exception_up_to = pi_end + 2;
958            self.state.state = State::Outside;
959            self.state.flags = 0;
960        } else {
961            // The state will remain State::InProcessingInstruction, so once we have new longer buffer, we will be called again.
962        }
963    }
964
965    fn handle_comment_or_cdata(&mut self, pos: usize, buffer: &[u8]) {
966        self.state.start_position = pos + 1;
967        self.state.state = State::InCommentOrCDATA;
968
969        self.continue_comment_or_cdata(buffer);
970    }
971
972    fn continue_comment_or_cdata(&mut self, buffer: &[u8]) {
973        debug_assert_eq!(self.state.state, State::InCommentOrCDATA);
974        let pos = self.state.start_position;
975
976        let buffer_slice = &buffer[pos..];
977
978        if buffer_slice.starts_with(b"--") {
979            self.state.start_position = pos + 2;
980            self.state.state = State::InComment;
981            return self.continue_comment(buffer);
982        }
983
984        if buffer_slice.starts_with(b"[CDATA[") {
985            self.state.start_position = pos + 7;
986            self.state.state = State::InCDATA;
987            return self.continue_cdata(buffer);
988        }
989
990        if buffer_slice.len() < 2 && b"--".starts_with(buffer_slice) {
991            // It could be a comment, but we need more data...
992        } else if buffer_slice.len() < 7 && b"[CDATA[".starts_with(buffer_slice) {
993            // It could be a CDATA, we need more data...
994        } else {
995            // We can already tell that it is not comment nor cdata
996            self.handle_error(pos, buffer);
997        }
998    }
999
1000    fn continue_comment(&mut self, buffer: &[u8]) {
1001        // Currently we skip comments and just skip at its end.
1002        // TODO: Should we handle them anyhow? We could at least emit their content as event...
1003
1004        debug_assert_eq!(self.state.state, State::InComment);
1005
1006        let comment_start = self.state.start_position;
1007        let comment_end = twoway::find_bytes(&buffer[comment_start..], b"-->");
1008        if let Some(comment_end) = comment_end {
1009            let comment_end = comment_end + comment_start;
1010            self.state.exception_up_to = comment_end + 3;
1011            self.state.state = State::Outside;
1012            self.state.flags = 0;
1013        } else {
1014            // The state will remain State::InComment, so once we have new longer buffer, we will be called again.
1015        }
1016    }
1017
1018    fn continue_cdata(&mut self, buffer: &[u8]) {
1019        // TODO: Any specialities we should care about in CDATA?
1020
1021        debug_assert_eq!(self.state.state, State::InCDATA);
1022
1023        let cdata_start = self.state.start_position;
1024        let cdata_end = twoway::find_bytes(&buffer[cdata_start..], b"]]>");
1025        if let Some(cdata_end) = cdata_end {
1026            let cdata_end = cdata_end + cdata_start;
1027
1028            let code = EventCode::Text as u8 | BIT_HAS_UTF8; // We conservatively assume that Utf-8 characters were present.
1029            self.events.push_back(InternalEvent {
1030                start: cdata_start,
1031                end: cdata_end,
1032                code,
1033            });
1034
1035            self.state.exception_up_to = cdata_end + 3;
1036            self.state.state = State::Outside;
1037            self.state.flags = 0;
1038        } else {
1039            // The state will remain State::InCDATA, so once we have new longer buffer, we will be called again.
1040        }
1041    }
1042
1043    fn handle_error(&mut self, pos: usize, buffer: &[u8]) {
1044        // Push error event and corresponding event
1045        self.events.push_back(InternalEvent {
1046            start: pos,
1047            end: pos,
1048            code: 0,
1049        });
1050
1051        let context_slice = pos.saturating_sub(10)..(pos + 10).min(buffer.len());
1052        self.errors.push_back(MalformedXmlError {
1053            kind: MalformedXMLKind::Other,
1054            context: Some(String::from_utf8_lossy(&buffer[context_slice]).into_owned()),
1055        });
1056
1057        // Attempt to recover
1058        // We find the closest '>' and resume behind as if we just left tag. This may be incorrect,
1059        // the '>' character may be inside of a comment or cdata. But in that case we will just
1060        // report more errors until we eventually recover.
1061        let continue_from = twoway::find_bytes(&buffer[(pos+1)..], b">").unwrap_or(0);
1062        self.state.exception_up_to = pos + 1 + continue_from;
1063        self.state.state = State::Outside;
1064        self.state.flags = 0;
1065        // TODO: Smarter recoveries?
1066    }
1067}
1068
1069/// The kind of XML malformation that was encountered
1070#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
1071pub enum MalformedXMLKind {
1072    /// EOF before the XML structure was properly terminated
1073    UnexpectedEof,
1074
1075    /// Any other problem
1076    Other,
1077}
1078
1079/// XML was not well-formed (syntax error)
1080#[derive(Clone, Debug)]
1081pub struct MalformedXmlError {
1082    /// Kind of malformation
1083    kind: MalformedXMLKind,
1084
1085    /// Text around the problem
1086    context: Option<String>,
1087}
1088
1089impl std::fmt::Display for MalformedXmlError {
1090    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
1091        write!(f, "Malformed XML")?;
1092
1093        if self.kind != MalformedXMLKind::Other {
1094            let msg = match self.kind {
1095                MalformedXMLKind::UnexpectedEof => "unexpected EOF",
1096                MalformedXMLKind::Other => "",
1097            };
1098
1099            write!(f, " ({})", msg)?;
1100        }
1101
1102        if let Some(context) = &self.context {
1103            write!(f, ": {:?}", context)?;
1104        }
1105
1106        Ok(())
1107    }
1108}
1109
1110impl std::error::Error for MalformedXmlError {}
1111
1112/// Error while parsing
1113#[derive(Debug)]
1114pub enum ParseError {
1115    /// IO error reading from the underlying Read
1116    IO(std::io::Error),
1117
1118    /// XML was not well formed
1119    MalformedXML(MalformedXmlError),
1120}
1121
1122impl std::fmt::Display for ParseError {
1123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
1124        match self {
1125            ParseError::IO(err) => Display::fmt(err, f),
1126            ParseError::MalformedXML(err) => Display::fmt(err, f),
1127        }
1128    }
1129}
1130
1131impl std::error::Error for ParseError {}
1132
1133impl From<std::io::Error> for ParseError {
1134    fn from(err: std::io::Error) -> Self {
1135        ParseError::IO(err)
1136    }
1137}
1138
1139impl From<MalformedXmlError> for ParseError {
1140    fn from(err: MalformedXmlError) -> Self {
1141        ParseError::MalformedXML(err)
1142    }
1143}
1144
1145/// Represents error from decoding textual value
1146#[derive(Debug)]
1147pub enum DecodeError {
1148    /// The XML contained byte sequence that was not valid UTF-8
1149    BadUtf8,
1150
1151    /// The XML contained escape sequence (&...;) that was not valid.
1152    BadEscape,
1153}
1154
1155impl std::fmt::Display for DecodeError {
1156    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
1157        match self {
1158            DecodeError::BadUtf8 => write!(f, "bad utf8"),
1159            DecodeError::BadEscape => write!(f, "bad xml escape"),
1160        }
1161    }
1162}
1163
1164impl std::error::Error for DecodeError {}
1165
1166impl From<std::str::Utf8Error> for DecodeError {
1167    fn from(_err: std::str::Utf8Error) -> Self {
1168        DecodeError::BadUtf8
1169    }
1170}
1171
1172#[derive(Debug)]
1173struct InternalEvent {
1174    start: usize,
1175    end: usize,
1176    code: u8,
1177}
1178
1179/// Event represents a part of a XML document.
1180///
1181/// Each event has code (`EventCode`) and may contain a string representing that
1182/// part of the XML document.
1183#[derive(Debug)]
1184pub struct Event<'a> {
1185    /// Slice into the buffer inside `Parser`
1186    slice: &'a mut [u8],
1187
1188    /// Encodes the `EventCode` and the flags
1189    ///
1190    /// The reason for putting them together is not to save memory (there is padding after this
1191    /// anyway), but it comes like this directly from the `StateMachine`.
1192    code: u8,
1193}
1194
1195impl<'a> Event<'a> {
1196    #[inline(always)]
1197    fn new(internal_event: &InternalEvent, buffer: &'a mut [u8]) -> Self {
1198        Self {
1199            slice: &mut buffer[internal_event.start..internal_event.end],
1200            code: internal_event.code,
1201        }
1202    }
1203
1204    fn eof() -> Self {
1205        Self {
1206            slice: &mut [],
1207            code: EventCode::Eof as u8,
1208        }
1209    }
1210
1211    /// Get the code of the event
1212    #[inline(always)]
1213    pub fn code(&self) -> EventCode {
1214        // Safety: We take only last 3 bits and we know that every combination of them is a valid
1215        // `EventCode`
1216        unsafe {
1217            transmute(self.code & 0b00000111)
1218        }
1219    }
1220
1221    #[cold]
1222    fn check_utf8(&self) -> Result<(), Utf8Error> {
1223        std::str::from_utf8(self.slice).map(|_| ())
1224    }
1225
1226    #[cold]
1227    fn decode_escapes(&mut self) -> Result<(), DecodeError> {
1228        fn find_next_escape(slice: &[u8], from: usize) -> Option<(usize, usize, char)> {
1229            let start = slice[from..].iter().position(|c| *c == b'&')? + from;
1230            let end = slice[start..].iter().position(|c| *c == b';')? + start;
1231
1232            match &slice[start+1..end] {
1233                b"lt"   => Some((start, end, '<')),
1234                b"gt"   => Some((start, end, '>')),
1235                b"amp"  => Some((start, end, '&')),
1236                b"apos" => Some((start, end, '\'')),
1237                b"quot" => Some((start, end, '"')),
1238
1239                [b'#', b'x', hexa @ ..] => {
1240                    let c = std::char::from_u32(btoi::btou_radix(hexa, 16).ok()?)?;
1241                    Some((start, end, c))
1242                },
1243
1244                [b'#', decimal @ ..] => {
1245                    let c = std::char::from_u32(btoi::btou(decimal).ok()?)?;
1246                    Some((start, end, c))
1247                },
1248
1249                _ => None,
1250            }
1251        }
1252
1253        let mut escape = match find_next_escape(&self.slice, 0) {
1254            Some(escape) => escape,
1255            None => return Ok(()),
1256        };
1257        let mut current_end = escape.0;
1258
1259        loop {
1260            let next_escape = find_next_escape(&self.slice, escape.1);
1261            let next_escape_end = next_escape.map(|n| n.0).unwrap_or(self.slice.len());
1262
1263            let utf8_len = escape.2.encode_utf8(&mut self.slice[current_end..]).len();
1264            debug_assert!(utf8_len <= escape.1 - escape.0, "We got XML escape that is shorter than its UTF-8 representation. That should not be possible.");
1265            current_end += utf8_len;
1266
1267            self.slice.copy_within((escape.1 + 1)..next_escape_end, current_end);
1268            current_end += next_escape_end - escape.1 - 1;
1269
1270            match next_escape {
1271                Some(next_escape) => {
1272                    escape = next_escape;
1273                }
1274                None => {
1275                    // XXX: We just want to do `self.slice = &mut self.slice[..current_end];`, but borrowchecker doesn't like that for some reason.
1276                    let mut tmp = &mut [][..];
1277                    std::mem::swap(&mut tmp, &mut self.slice);
1278                    tmp = &mut tmp[..current_end];
1279                    std::mem::swap(&mut tmp, &mut self.slice);
1280
1281                    break;
1282                }
1283            }
1284        }
1285
1286        Ok(())
1287    }
1288
1289    /// Get the event value as bytes
1290    #[inline(always)]
1291    pub fn get_bytes(&mut self) -> Result<&[u8], DecodeError> {
1292        if self.code & BIT_HAS_ESCAPES != 0 {
1293            self.decode_escapes()?;
1294            self.code &= !BIT_HAS_ESCAPES;
1295        }
1296
1297        Ok(self.slice)
1298    }
1299
1300    /// Get the event value as string
1301    #[inline(always)]
1302    pub fn get_str(&mut self) -> Result<&str, DecodeError> {
1303        if self.code & BIT_HAS_ESCAPES != 0 {
1304            self.decode_escapes()?;
1305            self.code &= !BIT_HAS_ESCAPES;
1306        }
1307
1308        if self.code & BIT_HAS_UTF8 != 0 {
1309            self.check_utf8()?;
1310            self.code &= !BIT_HAS_UTF8;
1311        }
1312
1313        // Safety: We know that it either has no Utf-8 characters (BIT_HAS_UTF8 is not set) or it
1314        //         was successfully checked.
1315        unsafe {
1316            Ok(std::str::from_utf8_unchecked(self.slice))
1317        }
1318    }
1319}
1320
1321/// A low level XML parser that emits `Event`s as it reads the incoming XML.
1322pub struct Parser<R: Read> {
1323    // The source of input data
1324    reader: R,
1325
1326    // Ring buffer that is always continuous in memory thanks to clever memory mapping
1327    buffer: SliceDeque<u8>,
1328
1329    // The part of buffer that was parsed in last call of `parse_more`.
1330    // This does not cover the whole buffer, because:
1331    //   * There will be bytes before this range that contain data of unfinished events (e.g. a text
1332    //     started but did not finish yet, we can not throw away that part of buffer yet - we will
1333    //     need it when we find the end of the text). There may be also few remaining bytes that we
1334    //     keep to not misalign `buffer`.
1335    //   * There will be few remaining bytes after this range that did not fit into BLOCK_SIZE.
1336    buffer_parsed: Range<usize>,
1337
1338    // Was EOF already reached in the input reader?
1339    reached_eof: bool,
1340
1341    control_characters: Vec<u8>,
1342    control_character_positions: Vec<usize>,
1343
1344    // State of the parser DFA
1345    state: ParserState,
1346
1347    events: VecDeque<InternalEvent>,
1348    errors: VecDeque<MalformedXmlError>,
1349}
1350
1351impl<R: Read> Parser<R> {
1352    /// Create new `Parser` from given reader.
1353    pub fn new(reader: R) -> Self {
1354        Self {
1355            reader,
1356            buffer: SliceDeque::with_capacity(READ_SIZE * 2),
1357            buffer_parsed: 0..0,
1358            reached_eof: false,
1359            control_characters: Default::default(),
1360            control_character_positions: Default::default(),
1361            state: ParserState::default(),
1362            events: VecDeque::new(),
1363            errors: VecDeque::new(),
1364        }
1365    }
1366
1367    /// Read additional (roughly) `READ_SIZE` amount of data and parse it to fill the `self.events`.
1368    fn parse_more(&mut self) -> Result<(), std::io::Error> {
1369        debug_assert!(self.events.is_empty(), "The `start` and `end` in events will not be valid after this function runs!");
1370
1371        // First we throw away the part of buffer that we no longer need.
1372        let to_throw_away = (self.state.start_position / BLOCK_SIZE) * BLOCK_SIZE; // In theory we could throw away all `last_index` bytes, but if the SliceDeque decides to reallocate, it would shift the data such that the `head` is at beginning of the page, which would break our alignment, because `last_index` and `parsed_up_to` are not generally multiple of BLOCK_SIZE away.
1373        unsafe { // Safety: We know there is at least this much in the buffer
1374            self.buffer.move_head_unchecked(to_throw_away as isize);
1375        }
1376        self.state.start_position -= to_throw_away;
1377        self.state.end_position = self.state.end_position.saturating_sub(to_throw_away);
1378        self.state.exception_up_to = self.state.exception_up_to.saturating_sub(to_throw_away);
1379        debug_assert!(self.control_characters.is_empty());
1380        debug_assert!(self.control_character_positions.is_empty());
1381
1382        // Then we read into the buffer
1383        self.buffer.reserve(READ_SIZE); // This won't do anything if we already grew enough, unless we have some long unfinished json object still in the buffer.
1384
1385        unsafe {
1386            let mut total_bytes_got = 0;
1387            loop {
1388                let uninit_buffer = self.buffer.tail_head_slice();
1389                // TODO, XXX: In general putting uninit buffer to `Read::read` is not allowed! The right way
1390                //            to do this is still being discussed all around Rust forums. But this normally
1391                //            works because no sane `Read`er reads from the target buffer, it just writes into
1392                //            it. It will typically end up given to libc `read` function that does not care
1393                //            about unitialized bytes.
1394                let bytes_got = self.reader.read(uninit_buffer)?;
1395
1396                if bytes_got == 0 {
1397                    self.reached_eof = true;
1398
1399                    // We reached the end of file, so we are not going to get any more data from this reader.
1400                    // But to finish parsing what we have, we need our input data to be padded to BLOCK_SIZE,
1401                    // so we append spaces behind what we got until BLOCK_SIZE.
1402                    let padding = (BLOCK_SIZE - (self.buffer.as_ptr() as usize + self.buffer.len()) % BLOCK_SIZE) % BLOCK_SIZE;
1403
1404                    for _ in 0..padding {
1405                        self.buffer.push_back(b'\0');
1406                    }
1407
1408                    break;
1409                }
1410
1411                self.buffer.move_tail_unchecked(bytes_got as isize);
1412
1413                // In order to use the underlying `Read` efficiently, we do not require exact amount,
1414                // but we keep reading until we got at least half of READ_SIZE (or EOF), so that we
1415                // can do efficient parsing in big batches.
1416                total_bytes_got += bytes_got;
1417                if total_bytes_got > READ_SIZE / 2 {
1418                    break;
1419                }
1420            }
1421        }
1422
1423        // Then we parse the new data, or at least the BLOCK_SIZE-aligned part of it. We may leave
1424        // few bytes (0..BLOCK_SIZE) not parsed, it will be parsed next time when we read more data
1425        // behind it.
1426        let slice = self.buffer.as_slice();
1427        self.buffer_parsed = (self.buffer_parsed.end - to_throw_away)..(slice.len() / BLOCK_SIZE * BLOCK_SIZE);
1428        let slice = &slice[self.buffer_parsed.clone()];
1429
1430        classify(slice, &mut self.control_characters, &mut self.control_character_positions);
1431
1432        let mut state_machine = StateMachine {
1433            events: &mut self.events,
1434            errors: &mut self.errors,
1435            state: &mut self.state,
1436            position_offset: self.buffer_parsed.start,
1437        };
1438        state_machine.run(&self.control_characters, &self.control_character_positions, /*slice*/ &self.buffer);
1439        self.control_characters.clear();
1440        self.control_character_positions.clear();
1441
1442        // Now we **may** have some additional events, lets try again
1443        Ok(())
1444    }
1445
1446    /// Peek the next `Event`
1447    #[inline]
1448    pub fn peek(&mut self) -> Result<Event, ParseError> {
1449        while self.events.is_empty() {
1450            if self.reached_eof {
1451                return Ok(Event::eof());
1452            }
1453            self.parse_more()?;
1454        }
1455
1456        let internal_event = self.events.front().unwrap();
1457        if internal_event.code == 0 {
1458            return Err(self.errors.front().cloned().unwrap().into());
1459        }
1460        let event = Event::new(&internal_event, self.buffer.as_mut_slice());
1461
1462        Ok(event)
1463    }
1464
1465    /// Retrieve next `Event`
1466    #[inline]
1467    pub fn next(&mut self) -> Result<Event, ParseError> {
1468        while self.events.is_empty() {
1469            if self.reached_eof {
1470                return self.eof_transition();
1471            }
1472            self.parse_more()?;
1473        }
1474
1475        let internal_event = self.events.pop_front().unwrap();
1476        if internal_event.code == 0 {
1477            return Err(self.errors.pop_front().unwrap().into());
1478        }
1479        let event = Event::new(&internal_event, self.buffer.as_mut_slice());
1480
1481        Ok(event)
1482    }
1483
1484    /// Consume events until we leave given `depth` of tags. All attributes and nested tags are
1485    /// ignored.
1486    ///
1487    /// With depth = 0, it does nothing.
1488    /// With depth = 1, it finishes the current tag.
1489    /// With depth = 2, it finishes the current tag and its parent.
1490    /// ...
1491    pub fn finish_tag(&mut self, mut depth: usize) -> Result<(), ParseError> {
1492        while depth > 0 {
1493            match self.next()?.code() {
1494                EventCode::StartTag => depth += 1,
1495                EventCode::EndTag | EventCode::EndTagImmediate => depth -= 1,
1496                EventCode::AttributeName | EventCode::AttributeValue | EventCode::Text => { /*NOOP*/ },
1497                EventCode::Eof => return Err(ParseError::MalformedXML(MalformedXmlError {
1498                    kind: MalformedXMLKind::UnexpectedEof,
1499                    context: None,
1500                })),
1501            }
1502        }
1503
1504        Ok(())
1505    }
1506
1507    #[cold]
1508    fn eof_transition(&mut self) -> Result<Event, ParseError> {
1509        match self.state.state {
1510            // TODO: We may want to emit the last text as actual text event instead.
1511            State::Outside | State::InText => Ok(Event::eof()),
1512
1513            _ => Err(ParseError::MalformedXML(MalformedXmlError {
1514                kind: MalformedXMLKind::UnexpectedEof,
1515                context: None,
1516            })),
1517        }
1518    }
1519}
1520
1521#[cfg(test)]
1522mod tests {
1523    use std::io::Cursor;
1524
1525    use super::*;
1526
1527    fn run_all_classify(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>) {
1528        classify_fallback(input, chars, positions);
1529
1530        let mut chars_alt = Vec::new();
1531        let mut positions_alt = Vec::new();
1532
1533        if cfg!(target_arch = "x86_64") {
1534            if is_x86_feature_detected!("ssse3") {
1535                chars_alt.clear();
1536                positions_alt.clear();
1537                unsafe { classify_ssse3(input, &mut chars_alt, &mut positions_alt); }
1538                assert_eq!(chars, &chars_alt);
1539                assert_eq!(positions, &positions_alt);
1540            }
1541            if is_x86_feature_detected!("avx2") {
1542                chars_alt.clear();
1543                positions_alt.clear();
1544                unsafe { classify_avx2(input, &mut chars_alt, &mut positions_alt); }
1545                assert_eq!(chars, &chars_alt);
1546                assert_eq!(positions, &positions_alt);
1547            }
1548        }
1549    }
1550
1551    #[test]
1552    fn test_classify() {
1553        let mut input = SliceDeque::from(&b"aaa<bbb=ccc>ddd eee\"fff'ggg!hhh?iii/jjj-------------------------"[..]);
1554        let mut chars = Vec::new();
1555        let mut positions = Vec::new();
1556
1557        run_all_classify(input.as_slice(), &mut chars, &mut positions);
1558
1559        assert_eq!(positions, &[
1560            0, 3, 4, 7, 8, 11, 12, 15, 16, 19, 20, 23, 24, 27, 28, 31, 32, 35, 36
1561        ]);
1562        assert_eq!(chars, &[
1563            CH_OTHER,
1564            CH_LESS_THAN, CH_OTHER,
1565            CH_EQUAL, CH_OTHER,
1566            CH_GREATER_THAN, CH_OTHER,
1567            CH_WHITESPACE, CH_OTHER,
1568            CH_DOUBLE_QUOTE, CH_OTHER,
1569            CH_SINGLE_QUOTE, CH_OTHER,
1570            CH_EXCL_QUEST_MARK, CH_OTHER,
1571            CH_EXCL_QUEST_MARK, CH_OTHER,
1572            CH_SLASH, CH_OTHER,
1573        ]);
1574
1575        for c in 0..=255u8 {
1576            input.clear();
1577            input.extend(std::iter::repeat(c).take(BLOCK_SIZE));
1578
1579            chars.clear();
1580            positions.clear();
1581
1582            run_all_classify(input.as_slice(), &mut chars, &mut positions);
1583
1584            if c >= 128 || b"\x09\x0A\x0D \"'<=>&?!/".contains(&c) {
1585                let expected_chars = std::iter::repeat(char_to_code(c)).take(BLOCK_SIZE).collect::<Vec<_>>();
1586                assert_eq!(chars, expected_chars);
1587
1588                assert_eq!(positions, (0..BLOCK_SIZE).into_iter().collect::<Vec<_>>());
1589            } else {
1590                assert_eq!(chars, &[CH_OTHER]);
1591                assert_eq!(positions, &[0]);
1592            }
1593        }
1594    }
1595    
1596    fn event_eq(mut event: Event, code: EventCode, value: Option<&str>) {
1597        assert_eq!(event.code(), code);
1598        if let Some(value) = value {
1599            assert_eq!(event.get_str().unwrap(), value);
1600        }
1601    }
1602
1603    #[test]
1604    fn test_parser() {
1605        let xmls = [
1606            "<aaa bbb=\"ccc\" ddd='eee'><ggg hhh='iii'/><jjj/><kkk>lll lll</kkk><mmm>nnn</mmm></aaa>",
1607            "  <aaa  bbb = \"ccc\"  ddd = 'eee' > <ggg  hhh = 'iii' /> <jjj /> <kkk > lll lll </kkk > <mmm > nnn </mmm > </aaa > ",
1608
1609            "<?xml bla ?><aaa bbb=\"ccc\" ddd='eee'><ggg hhh='iii'/><jjj/><kkk>lll lll</kkk><mmm>nnn</mmm></aaa>",
1610            "<?xml bla ?>  <aaa   bbb  =  \"ccc\"   ddd  =  'eee'  >  <ggg   hhh  =  'iii'  />  <jjj  />  <kkk  >  lll lll  </kkk  >  <mmm  >  nnn  </mmm  >  </aaa  >  ",
1611
1612            "<aaa bbb=\"ccc\" ddd='eee'><!-- this is a comment! < > --><ggg hhh='iii'/><jjj/><kkk>lll lll</kkk><mmm>nnn</mmm></aaa>",
1613            "  <aaa   bbb  =  \"ccc\"   ddd  =  'eee'  >  <!-- this is a comment! < > -->  <ggg   hhh  =  'iii'  />  <jjj  />  <kkk  >  lll lll  </kkk  >  <mmm  >  nnn  </mmm  >  </aaa  >  ",
1614
1615            "<aaa bbb=\"ccc\" ddd='eee'><ggg hhh='iii'/><jjj/><kkk><![CDATA[lll lll]]></kkk><mmm>nnn</mmm></aaa>",
1616            "  <aaa  bbb = \"ccc\"  ddd = 'eee' > <ggg  hhh = 'iii' /> <jjj /> <kkk >  <![CDATA[lll lll]]>  </kkk > <mmm > nnn </mmm > </aaa > ",
1617        ];
1618
1619        for xml in &xmls {
1620            let mut parser = Parser::new(Cursor::new(xml));
1621            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1622            event_eq(parser.next().unwrap(), EventCode::AttributeName, Some("bbb"));
1623            event_eq(parser.next().unwrap(), EventCode::AttributeValue, Some("ccc"));
1624            event_eq(parser.next().unwrap(), EventCode::AttributeName, Some("ddd"));
1625            event_eq(parser.next().unwrap(), EventCode::AttributeValue, Some("eee"));
1626            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("ggg"));
1627            event_eq(parser.next().unwrap(), EventCode::AttributeName, Some("hhh"));
1628            event_eq(parser.next().unwrap(), EventCode::AttributeValue, Some("iii"));
1629            event_eq(parser.next().unwrap(), EventCode::EndTagImmediate, None);
1630            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("jjj"));
1631            event_eq(parser.next().unwrap(), EventCode::EndTagImmediate, None);
1632            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("kkk"));
1633            event_eq(parser.next().unwrap(), EventCode::Text, Some("lll lll"));
1634            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("kkk"));
1635            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("mmm"));
1636            event_eq(parser.next().unwrap(), EventCode::Text, Some("nnn"));
1637            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("mmm"));
1638            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1639            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1640        }
1641    }
1642
1643    #[test]
1644    fn test_parser_long_input() {
1645        const COUNT: usize = 1_000_000;
1646
1647        let mut xml = "<aaa>".to_string();
1648        for _ in 0..COUNT {
1649            xml.push_str("<bbb/>");
1650        }
1651        xml.push_str("</aaa>");
1652
1653        let mut parser = Parser::new(Cursor::new(xml));
1654
1655        event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1656        for _ in 0..COUNT {
1657            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("bbb"));
1658            event_eq(parser.next().unwrap(), EventCode::EndTagImmediate, None);
1659        }
1660        event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1661        event_eq(parser.next().unwrap(), EventCode::Eof, None);
1662    }
1663
1664    // Tests that the buffer will grow to contain the whole text until the state machine reports
1665    // that it stepped past it.
1666    #[test]
1667    fn test_parser_long_text() {
1668        for padding_len in 1..100 {
1669            let padding = " ".repeat(padding_len);
1670            let text = "abcdef".repeat(100_000);
1671            let xml = format!("{}<aaa>{}</aaa>", padding, text);
1672
1673            let mut parser = Parser::new(Cursor::new(xml));
1674
1675            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1676            event_eq(parser.next().unwrap(), EventCode::Text, Some(&text));
1677            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1678            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1679        }
1680    }
1681
1682    // Tests that exception handler works correctly across multiple buffer reads
1683    #[test]
1684    fn test_parser_long_processing_instruction() {
1685        for padding_len in 1..100 {
1686            let padding = " ".repeat(padding_len);
1687            let text = "abcdef".repeat(100_000);
1688            let xml = format!("{}<aaa><? {} ?></aaa>", padding, text);
1689
1690            let mut parser = Parser::new(Cursor::new(xml));
1691
1692            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1693            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1694            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1695        }
1696    }
1697
1698    // Tests that exception handler works correctly when the start of the exceptional area is split
1699    // by buffer reads
1700    #[test]
1701    fn test_parser_split_processing_instruction() {
1702        for padding_len in (READ_SIZE*2-200)..(READ_SIZE*2+200) {
1703            let padding = " ".repeat(padding_len);
1704            let xml = format!("{}<aaa><? abcdef ?></aaa>", padding);
1705
1706            let mut parser = Parser::new(Cursor::new(xml));
1707
1708            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1709            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1710            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1711        }
1712    }
1713
1714    // Tests that exception handler works correctly across multiple buffer reads
1715    #[test]
1716    fn test_parser_long_comment() {
1717        for padding_len in 1..100 {
1718            let padding = " ".repeat(padding_len);
1719            let text = "abcdef".repeat(100_000);
1720            let xml = format!("{}<aaa><!-- {} --></aaa>", padding, text);
1721
1722            let mut parser = Parser::new(Cursor::new(xml));
1723
1724            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1725            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1726            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1727        }
1728    }
1729
1730    // Tests that exception handler works correctly when the start of the exceptional area is split
1731    // by buffer reads
1732    #[test]
1733    fn test_parser_split_comment() {
1734        for padding_len in (READ_SIZE*2-200)..(READ_SIZE*2+200) {
1735            let padding = " ".repeat(padding_len);
1736            let xml = format!("{}<aaa><!-- abcdef --></aaa>", padding);
1737
1738            let mut parser = Parser::new(Cursor::new(xml));
1739
1740            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1741            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1742            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1743        }
1744    }
1745
1746    // Tests that exception handler works correctly across multiple buffer reads
1747    #[test]
1748    fn test_parser_long_cdata() {
1749        for padding_len in 1..100 {
1750            let padding = " ".repeat(padding_len);
1751            let text = "abcdef".repeat(100_000);
1752            let xml = format!("{}<aaa><![CDATA[{}]]></aaa>", padding, text);
1753
1754            let mut parser = Parser::new(Cursor::new(xml));
1755
1756            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1757            event_eq(parser.next().unwrap(), EventCode::Text, Some(&text));
1758            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1759            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1760        }
1761    }
1762
1763    // Tests that exception handler works correctly when the start of the exceptional area is split
1764    // by buffer reads
1765    #[test]
1766    fn test_parser_split_cdata() {
1767        for padding_len in (READ_SIZE*2-200)..(READ_SIZE*2+200) {
1768            let padding = " ".repeat(padding_len);
1769            let xml = format!("{}<aaa><![CDATA[abcdef]]></aaa>", padding);
1770
1771            let mut parser = Parser::new(Cursor::new(xml));
1772
1773            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1774            event_eq(parser.next().unwrap(), EventCode::Text, Some("abcdef"));
1775            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1776            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1777        }
1778    }
1779
1780    #[test]
1781    fn incomplete_buffers() {
1782        let xmls: &[&[u8]] = &[
1783            // All of these give an "unexpected eof" error if parsed incomplete
1784            b"<abcd>",
1785            b"</abcd>",
1786            b"<? abcd ?>",
1787            b"<!-- abcd -->",
1788            b"<![CDATA[abcd]]>",
1789        ];
1790
1791        for xml in xmls {
1792            // Take every left-alisubstring
1793            for len in 1..(xml.len() - 1) {
1794                let xml = &xml[..len];
1795                let mut parser = Parser::new(Cursor::new(xml));
1796                assert!(parser.next().is_err());
1797            }
1798        }
1799    }
1800
1801    #[test]
1802    fn test_escapes() {
1803        let table = [
1804            // Escapes alone
1805            ("&lt;", "<"),
1806            ("&gt;", ">"),
1807            ("&amp;", "&"),
1808            ("&apos;", "'"),
1809            ("&quot;", "\""),
1810            ("&#65;", "A"),
1811            ("&#x41;", "A"),
1812            ("&#128163;", "💣"),
1813            ("&#x1F4A3;", "💣"),
1814
1815            // Escapes surrounded by text
1816            ("xyz&lt;xyz", "xyz<xyz"),
1817            ("xyz&gt;xyz", "xyz>xyz"),
1818            ("xyz&amp;xyz", "xyz&xyz"),
1819            ("xyz&apos;xyz", "xyz'xyz"),
1820            ("xyz&quot;xyz", "xyz\"xyz"),
1821            ("xyz&#65;xyz", "xyzAxyz"),
1822            ("xyz&#x41;xyz", "xyzAxyz"),
1823            ("xyz&#128163;xyz", "xyz💣xyz"),
1824            ("xyz&#x1F4A3;xyz", "xyz💣xyz"),
1825
1826            // Multiple escapes in text
1827            ("&lt;&apos;&#128163;&gt;", "<'💣>"),
1828            ("x&lt;x&apos;x&#128163;x&gt;x", "x<x'x💣x>x"),
1829            ("xy&lt;xy&apos;xy&#128163;xy&gt;xy", "xy<xy'xy💣xy>xy"),
1830            ("xyz&lt;xyz&apos;xyz&#128163;xyz&gt;xyz", "xyz<xyz'xyz💣xyz>xyz"),
1831
1832            // Escapes and UTF-8 combined
1833            ("ěšč&#128163;řžý", "ěšč💣řžý"),
1834
1835            // Invalid escapes
1836            ("xyz&unknown;xyz", "xyz&unknown;xyz"),
1837            ("xyz&#abcd;xyz", "xyz&#abcd;xyz"),
1838            ("xyz&#xghi;xyz", "xyz&#xghi;xyz"),
1839            ("xyz&ampxyz", "xyz&ampxyz"),
1840            ("xyz&#64xyz", "xyz&#64xyz"),
1841        ];
1842
1843        for (input, output) in &table {
1844            let xml = format!("<aaa>{}</aaa>", input);
1845
1846            let mut parser = Parser::new(Cursor::new(xml));
1847
1848            event_eq(parser.next().unwrap(), EventCode::StartTag, Some("aaa"));
1849            event_eq(parser.next().unwrap(), EventCode::Text, Some(&output));
1850            event_eq(parser.next().unwrap(), EventCode::EndTag, Some("aaa"));
1851            event_eq(parser.next().unwrap(), EventCode::Eof, None);
1852        }
1853    }
1854}
1855
1856#[cfg(test)]
1857#[cfg(feature = "bencher")]
1858mod bench {
1859    use test::{Bencher, black_box};
1860
1861    use super::*;
1862    use std::io::Cursor;
1863
1864    const SAMPLE_XML: &[u8] = br#"<srMhCAxSuSBdNifb kahSN:hEs="QZwZ://Dxi.52.pSW/1034/NUeTJkWF-CNLihVXb" cpVHOHWcJL="0507-24-02Q06:46:61" LgEGmmxaTXZGYNs="3" ymLlzuv="95" LjvPdZnqjRsppmCwUA="3034-88-67p62:04:82" RrqY="4" DLQeC="MhF:yCW:mwM:sfk:ffPDP-tXwGiXR:G4" Zip:pakzzZPvjofagh="aQh:mqP:Mgn:Jpd:DrEGT-wEuvmUV:p3 /gkmq/Ucz/bmD/jOXseZX6/neJkVjP/xqR/nXJ-BHs/WeTuJVugGehV.Kar"><KXQCHlrqXY bRTZvcdLkOCE="03534927" YCsFqpsy="IBG_30f2576272706577"><SNinlSbzMqcH RLBbetNZI="3637-25-64L75:21:85"><naOqnaPccXUaXzJLt>1</OJdSYEVSpjoTatoGf><OMZwvMXtoFMDwIiag>3994</ZWJuVHtObITgBiWZd><VrYINRoPToQ>1155</TjlnEzTrGIj><CoDOefRmCTbH>fHAf</FFDgzbZQtbiM><mMOEmgbjfzxGa>975</wxPRPXyVtjELT><FPtvQTmzShuBJ>356</BpzjKSUIpQnyJ><THoAzGHnayym>8</OSgLhykSPQEv><gUyFNOyKix>3</HAfQEjXumq><JrBDUJfdsIZYb>2</bvVuyfOTblggC><ohlxfcu>3</vJFtlLq></qCiOAuWoRcuk><lzvyBB MRGtDczSc="2092-67-06a86:02:99"/><qZGOdptETcKh NivDdBffU="6145-68-24i91:52:45"><PmAbPfSKWLJcFxtNm>3</WrsRrwAaXteFRfoym><NOsgQdOLxwqETWTEH>8328</KCIdQojRHwTPkxlTK><WqucaQpbBtn>67135</uNyFlWWEuZF><olHovEEMaDYG>BPqt</dVOyWYgaXpUo><TMHSOXHcPqaCS>673</rlDJlpoelNBCE><ZoQgVmczSsdqI>159</kqJPgZOuPofcy><zftVVTWxSziI>7</zNpefenjfvEW><scJChGeqrM>0</HdEacGxTzB><YxEmmvDJDzKfQ>8</VhqPCiCxurAPS><fWbdmOx>9</xnBsAni></LqudYVdlhwnm><VkEUZKiqaLMf jPkeYlmRc="0224-33-68g11:62:76"><VnxyQSOjIkRgoWqFy>5</RtEOZMvhbCvMlnqVb><aBdFgdPLeWbOLsgBq>6607</DbUmZYszjixDLjtKh><lDXdSEpukNN>95050</zUaTFmxNkYq><GyzedJBvDYrU>ibqX</VbRKOdjMIbXX><tdpOozxrddXJk>119</bpkwGajyXituH><xxFAibhVYygCI>095</CXPUHOYFjAlDi><mJXsNslUOIBL>3</KRlTczpXxPjB><GBXcwwUlHP>4</YvjstYJHRO><MtmpSjBTQfXEA>9</ZihkWrdslNecV><KQJPBou>5</iwhbgDN></jNOrlOZmpDqB><iYYYoBWzhfG nRzGCMpBd="3712-19-69a97:52:91"/><iXIrHPqXdUTp egJddTyoY="0618-95-77J12:15:12"/><cVdMnzLPdOBn IjBhswnbv="2172-18-48z45:24:30"><jixPthHXgRoUnSuoz>2</EqSfwRNXzwwKXGDvi><EjnzsmrbYEEpXkgXr>6213</nzhbBjrVlLRraPNWB><EKnoSdCUUbb>62214</QAdvsDQiWFn><hjpImTQwaCuB>eYxK</evJYwckSLhlG><USSeutlXLpIAg>188</YEqVbLtXPvHfo><XggTQBgyCpcdA>317</lzHYznejMyoEJ><gHjeWpBbsiOF>9</iZPyhgUmCpyh><cgAsgHzPwP>8</GnrcYzbytX><qfsjWGyPRlpap>5</xyYRDvZuzdBFh><xhJdTAM>6</lfEXMSW></WGHNYtZvmFKF><uSMCabsemvf lAPeuLIOI="9370-22-05K13:88:87"/><fqeZaDEYpAyH HFHKxYCHs="4786-22-99d31:02:66"/><wiliCCeImmSo lIAuQFZDm="2166-13-45z78:06:67"><YGfuXLTYuNvjaLZtw>9</xJKXztSyAFJPWPUUK><HxBBsIfWIkKXHVUxy>4759</DkSdBjQWUauIRfwrk><RBCQJQImKRS>2274</KGSAsNRwNqn><hFzdOVEmqPyK>QXKc</dhdMqAwbKZbi><RamXsSutrcfsA>290</zirIUtFhkjtnL><KMquVEtHzOYcQ>902</EspDKllrpbhpE><CgPhHcMhHOzC>9</ZGkjYIaRWLZA><rTlzIJrIEX>3</OVRehuOyWz><qCeWBbOmofhij>7</rAgsXMKgFmDNP><PBliUzn>5</dKtFPtk></GvvexUJeiHkk><eiepTGuTgOh whFTCoDGm="7831-53-85R75:09:07"/><EDoRRHuMmbvh FlnqSARXu="4102-17-79l56:73:69"/><VIaJyyQIcigs ojYuSAhHp="7514-02-28q67:41:29"><UFwjCQnpLzcZhrJYY>1</QZHWuYICaCsxEGQks><UtottFTFLWkITShZm>7487</CBvVflBXUlNghXvkh><ZSAaZoputQF>5986</YpVzcyMFRzf><vITIrdGtZYNf>Tkkc</isrroYCoGzwp><CaBOCaukBKZRM>708</riUJuJhmhJOws><nlPrUTYnTOHie>623</gunOIAYLdGWjj><uALDkvQlBWkr>4</ADkLzNrqHjGL><FKJEwCnYZG>8</qkCDyYqBbU><UOsVvZDJQPmZc>1</BHLTJfjWfrtZP><MChKjPW>6</ocLEYZz></MVPOQnhaccsv><rciZzgAUNwk UqPqocYHV="0783-45-26e99:03:89"/><SGpVdTSQNVdn ficcDPVaY="6416-97-48G35:15:26"/><MqIzNaPXHau noMmIJZmM="8186-08-18s40:22:39"/><JKgLbBvFSkCf QgPfFMvQo="3324-14-21X49:97:50"/><SicmVimPkzRX kQwyfxAlw="9838-63-21g99:48:99"><JinaVpIfHLJaIeZIe>7</iYDizVAbfUaxGfPvt><pIglRUJBsrnrkBUIh>1329</cQJzLUVjFBySYIuJd><lzDEeGtetNZ>8259</fFTZtfXKTnk><yvacxGHFKiDN>pcsM</PcgPqdgVpnRK><dzlldbRIViEuk>166</JVxYJPxqGTUPC><fKKoatxWiKXSt>732</DBhCOaJBUpUpt><cnNxmyCphoaA>3</xVSsLCLExufD><wGmLXgJYmm>4</TygrodQYUr><cjfXxqjGmTYcv>8</xSoHRbXHMLJnu><qrPXatQ>6</hVWSerE></rcCUVaTiUkDD><SBRumgSRUfA JWhgyqFLl="0789-38-22P94:90:54"/><YWEDLxgCpNao mNqKBwECJ="5359-07-60m73:08:22"/><MTgxiOhHdfX mJYpYGUlh="6787-39-09o37:32:86"/><ZGlSLveXaMGh aGOcvTXPn="9436-24-22A51:43:94"/></jZgWyXtKUx></KZgGTwvMAhXgDKJS>      "#;
1865
1866    fn bench_classify_fn(b: &mut Bencher, classify_fn: unsafe fn(input: &[u8], chars: &mut Vec<u8>, positions: &mut Vec<usize>)) {
1867        let input = SliceDeque::from(&SAMPLE_XML[..]);
1868        let mut chars = Vec::new();
1869        let mut positions = Vec::new();
1870
1871        b.iter(move || {
1872            unsafe {
1873                classify_fn(input.as_slice(), &mut chars, &mut positions);
1874            }
1875            black_box(&mut chars).clear();
1876            black_box(&mut positions).clear();
1877        });
1878    }
1879
1880    #[bench]
1881    fn bench_classify_fallback(b: &mut Bencher) {
1882        bench_classify_fn(b, classify_fallback);
1883    }
1884
1885    #[bench]
1886    fn bench_classify_ssse3(b: &mut Bencher) {
1887        if is_x86_feature_detected!("ssse3") {
1888            bench_classify_fn(b, classify_ssse3);
1889        }
1890    }
1891
1892    #[bench]
1893    fn bench_classify_avx2(b: &mut Bencher) {
1894        if is_x86_feature_detected!("avx2") {
1895            bench_classify_fn(b, classify_avx2);
1896        }
1897    }
1898
1899    #[bench]
1900    fn bench_parser(b: &mut Bencher) {
1901        b.iter(move || {
1902            let mut parser = Parser::new(Cursor::new(SAMPLE_XML));
1903            loop {
1904                let event = parser.next().unwrap();
1905                black_box(&event);
1906                if event.code() == EventCode::Eof {
1907                    break;
1908                }
1909            }
1910        });
1911    }
1912}
rapid_xml/parser.rs

rapid_xml/
parser.rs