encoding/codec/
utf_8.rs

1// This is a part of encoding-next.
2// Copyright (c) 2013-2015, Kang Seonghoon.
3// See README.md and LICENSE.txt for details.
4//
5// Portions Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in
15// all copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25//! UTF-8, the universal encoding.
26
27use crate::types::*;
28use std::convert::Into;
29use std::str;
30
/// UTF-8 (UCS Transformation Format, 8-bit).
///
/// This is a Unicode encoding compatible with ASCII (ISO/IEC 646:US)
/// and able to represent all Unicode codepoints uniquely and unambiguously.
/// It has a variable-length design,
/// where one codepoint may use 1 (up to U+007F), 2 (up to U+07FF), 3 (up to U+FFFF)
/// or 4 bytes (up to U+10FFFF) depending on its value.
/// The first byte of the sequence is distinct from other "continuation" bytes of the sequence,
/// making UTF-8 self-synchronizable and easy to handle.
/// It has a fixed endianness, and can be lexicographically sorted by codepoints.
///
/// The UTF-8 scanner used by this module is heavily based on Bjoern Hoehrmann's
/// [Flexible and Economical UTF-8 Decoder](http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
#[derive(Clone, Copy, Debug)]
pub struct UTF8Encoding;
48
49impl Encoding for UTF8Encoding {
50    fn name(&self) -> &'static str {
51        "utf-8"
52    }
53    fn whatwg_name(&self) -> Option<&'static str> {
54        Some("utf-8")
55    }
56    fn raw_encoder(&self) -> Box<dyn RawEncoder> {
57        UTF8Encoder::new()
58    }
59    fn raw_decoder(&self) -> Box<dyn RawDecoder> {
60        UTF8Decoder::new()
61    }
62}
63
/// An encoder for UTF-8.
///
/// The encoder carries no state: its input is already a `&str`, so encoding
/// amounts to copying the bytes of the string to the output.
#[derive(Clone, Copy, Debug)]
pub struct UTF8Encoder;
67
68impl UTF8Encoder {
69    #[allow(clippy::new_ret_no_self)]
70    pub fn new() -> Box<dyn RawEncoder> {
71        Box::new(UTF8Encoder)
72    }
73}
74
75impl RawEncoder for UTF8Encoder {
76    fn from_self(&self) -> Box<dyn RawEncoder> {
77        UTF8Encoder::new()
78    }
79    fn is_ascii_compatible(&self) -> bool {
80        true
81    }
82
83    fn raw_feed(
84        &mut self,
85        input: &str,
86        output: &mut dyn ByteWriter,
87    ) -> (usize, Option<CodecError>) {
88        let input: &[u8] = input.as_bytes();
89        assert!(str::from_utf8(input).is_ok());
90        output.write_bytes(input);
91        (input.len(), None)
92    }
93
94    fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
95        None
96    }
97}
98
/// A decoder for UTF-8.
///
/// The decoder is a small state machine that remembers the bytes of a sequence
/// that was still incomplete at the end of the previous feed.
#[derive(Clone, Copy, Debug)]
pub struct UTF8Decoder {
    /// Number of bytes of `queue` that are currently in use.
    queuelen: usize,
    /// Bytes of an incomplete sequence carried over from earlier feeds.
    queue: [u8; 4],
    /// Current state of the scanner (a row offset into the transition table).
    state: u8,
}
106
107impl UTF8Decoder {
108    #[allow(clippy::new_ret_no_self)]
109    pub fn new() -> Box<dyn RawDecoder> {
110        Box::new(UTF8Decoder {
111            queuelen: 0,
112            queue: [0; 4],
113            state: INITIAL_STATE,
114        })
115    }
116}
117
// Maps every possible byte value to its character category (the column of
// `STATE_TRANSITIONS`). One row below covers sixteen consecutive byte values.
//
//  0 (00-7F): one byte sequence
//  1 (80-8F): continuation byte
//  2 (C2-DF): start of two byte sequence
//  3 (E1-EC,EE-EF): start of three byte sequence, next byte unrestricted
//  4 (ED): start of three byte sequence, next byte restricted to non-surrogates (80-9F)
//  5 (F4): start of four byte sequence, next byte restricted to U+10FFFF and below (80-8F)
//  6 (F1-F3): start of four byte sequence, next byte unrestricted
//  7 (A0-BF): continuation byte
//  8 (C0-C1,F5-FF): invalid (overlong or out-of-range) start of multi byte sequences
//  9 (90-9F): continuation byte
// 10 (E0): start of three byte sequence, next byte restricted to non-overlong (A0-BF)
// 11 (F0): start of four byte sequence, next byte restricted to non-overlong (90-BF)
static CHAR_CATEGORY: [u8; 256] = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
    9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90-9F
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // A0-AF
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // B0-BF
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C0-CF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D0-DF
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // E0-EF
    11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // F0-FF
];
141
// Transition table of the scanner, indexed by `state + category` where `state` is the
// offset of a row and `category` (0-11) comes from `CHAR_CATEGORY`.
// Each row holds twelve entries. The row of state 86 (indices 86-97) overlaps the tail
// of the row of state 84, which works because every entry in that range is 86.
static STATE_TRANSITIONS: [u8; 110] = [
    // state 0: at a character boundary, any start byte may follow
    0, 98, 12, 24, 48, 84, 72, 98, 98, 98, 36, 60,
    // state 12: one more continuation byte (80-BF) completes the character
    86, 0, 86, 86, 86, 86, 86, 0, 86, 0, 86, 86,
    // state 24: two more continuation bytes (80-BF) are required
    86, 12, 86, 86, 86, 86, 86, 12, 86, 12, 86, 86,
    // state 36: after E0, the next byte must be A0-BF, then one more continuation byte
    86, 86, 86, 86, 86, 86, 86, 12, 86, 86, 86, 86,
    // state 48: after ED, the next byte must be 80-9F, then one more continuation byte
    86, 12, 86, 86, 86, 86, 86, 86, 86, 12, 86, 86,
    // state 60: after F0, the next byte must be 90-BF, then two more continuation bytes
    86, 86, 86, 86, 86, 86, 86, 24, 86, 24, 86, 86,
    // state 72: after F1-F3, three more continuation bytes (80-BF) are required
    86, 24, 86, 86, 86, 86, 86, 24, 86, 24, 86, 86,
    // state 84: after F4, the next byte must be 80-8F, then two more continuation bytes
    86, 24, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86,
    // indices 96-97: the last two entries of the row of state 86 (every entry is 86)
    86, 86,
    // state 98: rejected, stays rejected whatever follows
    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
];
154
// State of a freshly created decoder; identical to the accepting state.
static INITIAL_STATE: u8 = 0;
// State reached whenever a complete, valid sequence has just been consumed.
static ACCEPT_STATE: u8 = 0;
// Rejecting state where the offending byte itself belongs to the invalid sequence.
static REJECT_STATE: u8 = 98;
// Rejecting state where the offending byte is not part of the invalid sequence,
// so the decoder reports the error as ending just before that byte.
static REJECT_STATE_WITH_BACKUP: u8 = 86;

// True for both rejecting states; relies on them being the two highest state values.
macro_rules! is_reject_state {
    ($state:expr) => {
        $state >= REJECT_STATE_WITH_BACKUP
    };
}

// Looks up the state that follows `$state` once the byte `$ch` has been read.
macro_rules! next_state {
    ($state:expr, $ch:expr) => {
        STATE_TRANSITIONS[($state + CHAR_CATEGORY[$ch as usize]) as usize]
    };
}
164
165impl RawDecoder for UTF8Decoder {
166    fn from_self(&self) -> Box<dyn RawDecoder> {
167        UTF8Decoder::new()
168    }
169    fn is_ascii_compatible(&self) -> bool {
170        true
171    }
172
173    fn raw_feed(
174        &mut self,
175        input: &[u8],
176        output: &mut dyn StringWriter,
177    ) -> (usize, Option<CodecError>) {
178        output.writer_hint(input.len());
179
180        fn write_bytes(output: &mut dyn StringWriter, bytes: &[u8]) {
181            output.write_str(unsafe { std::str::from_utf8_unchecked(bytes) });
182        }
183
184        let mut state = self.state;
185        let mut processed = 0;
186        let mut offset = 0;
187
188        // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
189        if state == INITIAL_STATE {
190            let first_msb = input
191                .iter()
192                .position(|&ch| ch >= 0x80)
193                .unwrap_or(input.len());
194            offset += first_msb;
195            processed += first_msb;
196        }
197
198        for (i, &ch) in input[offset..].iter().enumerate() {
199            state = next_state!(state, ch);
200            if state == ACCEPT_STATE {
201                processed = i + offset + 1;
202            } else if is_reject_state!(state) {
203                let upto = if state == REJECT_STATE {
204                    i + offset + 1
205                } else {
206                    i + offset
207                };
208                self.state = INITIAL_STATE;
209                if processed > 0 && self.queuelen > 0 {
210                    // flush `queue` outside the problem
211                    write_bytes(output, &self.queue[0..self.queuelen]);
212                }
213                self.queuelen = 0;
214                write_bytes(output, &input[0..processed]);
215                return (
216                    processed,
217                    Some(CodecError {
218                        upto: upto as isize,
219                        cause: "invalid sequence".into(),
220                    }),
221                );
222            }
223        }
224
225        self.state = state;
226        if processed > 0 && self.queuelen > 0 {
227            // flush `queue`
228            write_bytes(output, &self.queue[0..self.queuelen]);
229            self.queuelen = 0;
230        }
231        write_bytes(output, &input[0..processed]);
232        if processed < input.len() {
233            let morequeuelen = input.len() - processed;
234            for i in 0..morequeuelen {
235                self.queue[self.queuelen + i] = input[processed + i];
236            }
237            self.queuelen += morequeuelen;
238        }
239        (processed, None)
240    }
241
242    fn raw_finish(&mut self, _output: &mut dyn StringWriter) -> Option<CodecError> {
243        let state = self.state;
244        let queuelen = self.queuelen;
245        self.state = INITIAL_STATE;
246        self.queuelen = 0;
247        if state != ACCEPT_STATE {
248            Some(CodecError {
249                upto: 0,
250                cause: "incomplete sequence".into(),
251            })
252        } else {
253            assert!(queuelen == 0);
254            None
255        }
256    }
257}
258
259/// Almost equivalent to `std::str::from_utf8`.
260/// This function is provided for the fair benchmark against the stdlib's UTF-8 conversion
261/// functions, as encoding-next always allocates a new string.
262pub fn from_utf8(input: &[u8]) -> Option<&str> {
263    let mut iter = input.iter();
264    let mut state;
265
266    macro_rules! return_as_whole(() => (return Some(unsafe {std::str::from_utf8_unchecked(input)})));
267
268    // optimization: if we are in the initial state, quickly skip to the first non-MSB-set byte.
269    loop {
270        match iter.next() {
271            Some(&ch) if ch < 0x80 => {}
272            Some(&ch) => {
273                state = next_state!(INITIAL_STATE, ch);
274                break;
275            }
276            None => {
277                return_as_whole!();
278            }
279        }
280    }
281
282    for &ch in iter {
283        state = next_state!(state, ch);
284        if is_reject_state!(state) {
285            return None;
286        }
287    }
288    if state != ACCEPT_STATE {
289        return None;
290    }
291    return_as_whole!();
292}
293
294#[cfg(test)]
295mod tests {
    // portions of these tests are adapted from Markus Kuhn's UTF-8 decoder capability and
    // stress test: <http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt>.
298
299    use super::{from_utf8, UTF8Encoding};
300    use crate::testutils;
301    use crate::types::*;
302    use std::str;
303
304    #[test]
305    fn test_valid() {
306        // one byte
307        let mut d = UTF8Encoding.raw_decoder();
308        assert_feed_ok!(d, [0x41], [], "A");
309        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
310        assert_feed_ok!(d, [], [], "");
311        assert_feed_ok!(d, [0x44, 0x45, 0x46], [], "DEF");
312        assert_finish_ok!(d, "");
313
314        // two bytes
315        let mut d = UTF8Encoding.raw_decoder();
316        assert_feed_ok!(d, [0xc2, 0xa2], [], "\u{a2}");
317        assert_feed_ok!(d, [0xc2, 0xac, 0xc2, 0xa9], [], "\u{ac}\u{0a9}");
318        assert_feed_ok!(d, [], [], "");
319        assert_feed_ok!(
320            d,
321            [
322                0xd5, 0xa1, 0xd5, 0xb5, 0xd5, 0xa2, 0xd5, 0xb8, 0xd6, 0x82, 0xd5, 0xa2, 0xd5, 0xa5,
323                0xd5, 0xb6
324            ],
325            [],
326            "\u{561}\u{0575}\u{562}\u{578}\u{582}\u{562}\u{565}\u{576}"
327        );
328        assert_finish_ok!(d, "");
329
330        // three bytes
331        let mut d = UTF8Encoding.raw_decoder();
332        assert_feed_ok!(d, [0xed, 0x92, 0x89], [], "\u{d489}");
333        assert_feed_ok!(
334            d,
335            [0xe6, 0xbc, 0xa2, 0xe5, 0xad, 0x97],
336            [],
337            "\u{6f22}\u{5b57}"
338        );
339        assert_feed_ok!(d, [], [], "");
340        assert_feed_ok!(
341            d,
342            [0xc9, 0x99, 0xc9, 0x94, 0xc9, 0x90],
343            [],
344            "\u{259}\u{0254}\u{250}"
345        );
346        assert_finish_ok!(d, "");
347
348        // four bytes
349        let mut d = UTF8Encoding.raw_decoder();
350        assert_feed_ok!(d, [0xf0, 0x90, 0x82, 0x82], [], "\u{10082}");
351        assert_feed_ok!(d, [], [], "");
352        assert_finish_ok!(d, "");
353
354        // we don't test encoders as it is largely a no-op.
355    }
356
357    #[test]
358    fn test_valid_boundary() {
359        let mut d = UTF8Encoding.raw_decoder();
360        assert_feed_ok!(d, [0x00], [], "\x00");
361        assert_finish_ok!(d, "");
362
363        let mut d = UTF8Encoding.raw_decoder();
364        assert_feed_ok!(d, [0x7f], [], "\x7f");
365        assert_finish_ok!(d, "");
366
367        let mut d = UTF8Encoding.raw_decoder();
368        assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
369        assert_finish_ok!(d, "");
370
371        let mut d = UTF8Encoding.raw_decoder();
372        assert_feed_ok!(d, [0xdf, 0xbf], [], "\u{7ff}");
373        assert_finish_ok!(d, "");
374
375        let mut d = UTF8Encoding.raw_decoder();
376        assert_feed_ok!(d, [0xe0, 0xa0, 0x80], [], "\u{800}");
377        assert_finish_ok!(d, "");
378
379        let mut d = UTF8Encoding.raw_decoder();
380        assert_feed_ok!(d, [0xed, 0x9f, 0xbf], [], "\u{d7ff}");
381        assert_finish_ok!(d, "");
382
383        let mut d = UTF8Encoding.raw_decoder();
384        assert_feed_ok!(d, [0xee, 0x80, 0x80], [], "\u{e000}");
385        assert_finish_ok!(d, "");
386
387        let mut d = UTF8Encoding.raw_decoder();
388        assert_feed_ok!(d, [0xef, 0xbf, 0xbf], [], "\u{ffff}");
389        assert_finish_ok!(d, "");
390
391        let mut d = UTF8Encoding.raw_decoder();
392        assert_feed_ok!(d, [0xf0, 0x90, 0x80, 0x80], [], "\u{10000}");
393        assert_finish_ok!(d, "");
394
395        let mut d = UTF8Encoding.raw_decoder();
396        assert_feed_ok!(d, [0xf4, 0x8f, 0xbf, 0xbf], [], "\u{10ffff}");
397        assert_finish_ok!(d, "");
398    }
399
400    #[test]
401    fn test_valid_partial() {
402        let mut d = UTF8Encoding.raw_decoder();
403        assert_feed_ok!(d, [], [0xf0], "");
404        assert_feed_ok!(d, [], [0x90], "");
405        assert_feed_ok!(d, [], [0x82], "");
406        assert_feed_ok!(d, [0x82], [0xed], "\u{10082}");
407        assert_feed_ok!(d, [0x92, 0x89], [], "\u{d489}");
408        assert_finish_ok!(d, "");
409
410        let mut d = UTF8Encoding.raw_decoder();
411        assert_feed_ok!(d, [], [0xc2], "");
412        assert_feed_ok!(d, [0xa9, 0x20], [], "\u{a9}\u{020}");
413        assert_finish_ok!(d, "");
414    }
415
416    #[test]
417    fn test_invalid_continuation() {
418        for c in 0x80..0xc0 {
419            let mut d = UTF8Encoding.raw_decoder();
420            assert_feed_err!(d, [], [c], [], "");
421            assert_finish_ok!(d, "");
422
423            let mut d = UTF8Encoding.raw_decoder();
424            assert_feed_err!(d, [], [c], [c], "");
425            assert_finish_ok!(d, "");
426
427            let mut d = UTF8Encoding.raw_decoder();
428            assert_feed_err!(d, [], [c], [c, c], "");
429            assert_finish_ok!(d, "");
430        }
431    }
432
433    #[test]
434    fn test_invalid_surrogate() {
435        // surrogates should fail at the second byte.
436
437        let mut d = UTF8Encoding.raw_decoder();
438        assert_feed_err!(d, [], [0xed], [0xa0, 0x80], "");
439        assert_finish_ok!(d, "");
440
441        let mut d = UTF8Encoding.raw_decoder();
442        assert_feed_err!(d, [], [0xed], [0xad, 0xbf], "");
443        assert_finish_ok!(d, "");
444
445        let mut d = UTF8Encoding.raw_decoder();
446        assert_feed_err!(d, [], [0xed], [0xae, 0x80], "");
447        assert_finish_ok!(d, "");
448
449        let mut d = UTF8Encoding.raw_decoder();
450        assert_feed_err!(d, [], [0xed], [0xaf, 0xbf], "");
451        assert_finish_ok!(d, "");
452
453        let mut d = UTF8Encoding.raw_decoder();
454        assert_feed_err!(d, [], [0xed], [0xb0, 0x80], "");
455        assert_finish_ok!(d, "");
456
457        let mut d = UTF8Encoding.raw_decoder();
458        assert_feed_err!(d, [], [0xed], [0xbe, 0x80], "");
459        assert_finish_ok!(d, "");
460
461        let mut d = UTF8Encoding.raw_decoder();
462        assert_feed_err!(d, [], [0xed], [0xbf, 0xbf], "");
463        assert_finish_ok!(d, "");
464    }
465
466    #[test]
467    fn test_invalid_boundary() {
468        // as with surrogates, should fail at the second byte.
469        let mut d = UTF8Encoding.raw_decoder();
470        assert_feed_err!(d, [], [0xf4], [0x90, 0x90, 0x90], ""); // U+110000
471        assert_finish_ok!(d, "");
472    }
473
474    #[test]
475    fn test_invalid_start_immediate_test_finish() {
476        for c in 0xf5..0x100 {
477            let c = c as u8;
478            let mut d = UTF8Encoding.raw_decoder();
479            assert_feed_err!(d, [], [c], [], "");
480            assert_finish_ok!(d, "");
481        }
482    }
483
484    #[test]
485    fn test_invalid_start_followed_by_space() {
486        for c in 0xf5..0x100 {
487            let c = c as u8;
488
489            let mut d = UTF8Encoding.raw_decoder();
490            assert_feed_err!(d, [], [c], [0x20], "");
491            assert_finish_ok!(d, "");
492
493            let mut d = UTF8Encoding.raw_decoder();
494            assert_feed_err!(d, [], [c], [], "");
495            assert_feed_ok!(d, [0x20], [], "\x20");
496            assert_finish_ok!(d, "");
497        }
498    }
499
500    #[test]
501    fn test_invalid_lone_start_immediate_test_finish() {
502        for c in 0xc2..0xf5 {
503            let mut d = UTF8Encoding.raw_decoder();
504            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
505            assert_finish_err!(d, "");
506        }
507    }
508
509    #[test]
510    fn test_invalid_lone_start_followed_by_space() {
511        for c in 0xc2..0xf5 {
512            let mut d = UTF8Encoding.raw_decoder();
513            assert_feed_err!(d, [], [c], [0x20], "");
514            assert_finish_ok!(d, "");
515
516            let mut d = UTF8Encoding.raw_decoder();
517            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
518            assert_feed_err!(d, [], [], [0x20], "");
519            assert_finish_ok!(d, "");
520        }
521    }
522
523    #[test]
524    fn test_invalid_incomplete_three_byte_seq_followed_by_space() {
525        for b in 0xe0..0xf5 {
526            let c = if b == 0xe0 || b == 0xf0 { 0xa0 } else { 0x80 };
527
528            let mut d = UTF8Encoding.raw_decoder();
529            assert_feed_err!(d, [], [b, c], [0x20], "");
530            assert_finish_ok!(d, "");
531
532            let mut d = UTF8Encoding.raw_decoder();
533            assert_feed_ok!(d, [], [b, c], ""); // wait for cont. bytes
534            assert_feed_err!(d, [], [], [0x20], "");
535            assert_finish_ok!(d, "");
536
537            let mut d = UTF8Encoding.raw_decoder();
538            assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
539            assert_feed_err!(d, [], [c], [0x20], "");
540            assert_finish_ok!(d, "");
541
542            let mut d = UTF8Encoding.raw_decoder();
543            assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
544            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
545            assert_feed_err!(d, [], [], [0x20], "");
546            assert_finish_ok!(d, "");
547        }
548    }
549
550    #[test]
551    fn test_invalid_incomplete_four_byte_seq_followed_by_space() {
552        for a in 0xf0..0xf5 {
553            let b = if a == 0xf0 { 0xa0 } else { 0x80 };
554            let c = 0x80;
555
556            let mut d = UTF8Encoding.raw_decoder();
557            assert_feed_err!(d, [], [a, b, c], [0x20], "");
558            assert_finish_ok!(d, "");
559
560            let mut d = UTF8Encoding.raw_decoder();
561            assert_feed_ok!(d, [], [a], ""); // wait for cont. bytes
562            assert_feed_ok!(d, [], [b], ""); // wait for cont. bytes
563            assert_feed_ok!(d, [], [c], ""); // wait for cont. bytes
564            assert_feed_err!(d, [], [], [0x20], "");
565            assert_finish_ok!(d, "");
566
567            let mut d = UTF8Encoding.raw_decoder();
568            assert_feed_ok!(d, [], [a, b], ""); // wait for cont. bytes
569            assert_feed_err!(d, [], [c], [0x20], "");
570            assert_finish_ok!(d, "");
571
572            let mut d = UTF8Encoding.raw_decoder();
573            assert_feed_ok!(d, [], [a, b, c], ""); // wait for cont. bytes
574            assert_feed_err!(d, [], [], [0x20], "");
575            assert_finish_ok!(d, "");
576        }
577    }
578
579    #[test]
580    fn test_invalid_too_many_cont_bytes() {
581        let mut d = UTF8Encoding.raw_decoder();
582        assert_feed_err!(d, [0xc2, 0x80], [0x80], [], "\u{80}");
583        assert_finish_ok!(d, "");
584
585        let mut d = UTF8Encoding.raw_decoder();
586        assert_feed_err!(d, [0xe0, 0xa0, 0x80], [0x80], [], "\u{800}");
587        assert_finish_ok!(d, "");
588
589        let mut d = UTF8Encoding.raw_decoder();
590        assert_feed_err!(d, [0xf0, 0x90, 0x80, 0x80], [0x80], [], "\u{10000}");
591        assert_finish_ok!(d, "");
592
593        // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
594        let mut d = UTF8Encoding.raw_decoder();
595        assert_feed_err!(d, [], [0xf8], [0x88, 0x80, 0x80, 0x80, 0x80], "");
596        assert_finish_ok!(d, "");
597
598        let mut d = UTF8Encoding.raw_decoder();
599        assert_feed_err!(d, [], [0xfc], [0x84, 0x80, 0x80, 0x80, 0x80, 0x80], "");
600        assert_finish_ok!(d, "");
601
602        let mut d = UTF8Encoding.raw_decoder();
603        assert_feed_err!(d, [], [0xfe], [0x80], "");
604        assert_finish_ok!(d, "");
605
606        let mut d = UTF8Encoding.raw_decoder();
607        assert_feed_err!(d, [], [0xff], [0x80], "");
608        assert_finish_ok!(d, "");
609    }
610
611    #[test]
612    fn test_invalid_too_many_cont_bytes_partial() {
613        let mut d = UTF8Encoding.raw_decoder();
614        assert_feed_ok!(d, [], [0xc2], "");
615        assert_feed_err!(d, [0x80], [0x80], [], "\u{80}");
616        assert_finish_ok!(d, "");
617
618        let mut d = UTF8Encoding.raw_decoder();
619        assert_feed_ok!(d, [], [0xe0, 0xa0], "");
620        assert_feed_err!(d, [0x80], [0x80], [], "\u{800}");
621        assert_finish_ok!(d, "");
622
623        let mut d = UTF8Encoding.raw_decoder();
624        assert_feed_ok!(d, [], [0xf0, 0x90, 0x80], "");
625        assert_feed_err!(d, [0x80], [0x80], [], "\u{10000}");
626        assert_finish_ok!(d, "");
627
628        // no continuation byte is consumed after 5/6-byte sequence starters and FE/FF
629        let mut d = UTF8Encoding.raw_decoder();
630        assert_feed_err!(d, [], [0xf8], [], "");
631        assert_feed_err!(d, [], [0x88], [0x80, 0x80, 0x80, 0x80], "");
632        assert_finish_ok!(d, "");
633
634        let mut d = UTF8Encoding.raw_decoder();
635        assert_feed_err!(d, [], [0xfc], [], "");
636        assert_feed_err!(d, [], [0x84], [0x80, 0x80, 0x80, 0x80, 0x80], "");
637        assert_finish_ok!(d, "");
638
639        let mut d = UTF8Encoding.raw_decoder();
640        assert_feed_err!(d, [], [0xfe], [], "");
641        assert_feed_err!(d, [], [0x80], [], "");
642        assert_finish_ok!(d, "");
643
644        let mut d = UTF8Encoding.raw_decoder();
645        assert_feed_err!(d, [], [0xff], [], "");
646        assert_feed_err!(d, [], [0x80], [], "");
647        assert_finish_ok!(d, "");
648    }
649
650    #[test]
651    fn test_invalid_overlong_minimal() {
652        let mut d = UTF8Encoding.raw_decoder();
653        assert_feed_err!(d, [], [0xc0], [0x80], "");
654        assert_finish_ok!(d, "");
655
656        let mut d = UTF8Encoding.raw_decoder();
657        assert_feed_err!(d, [], [0xe0], [0x80, 0x80], "");
658        assert_finish_ok!(d, "");
659
660        let mut d = UTF8Encoding.raw_decoder();
661        assert_feed_err!(d, [], [0xf0], [0x80, 0x80, 0x80], "");
662        assert_finish_ok!(d, "");
663    }
664
665    #[test]
666    fn test_invalid_overlong_maximal() {
667        let mut d = UTF8Encoding.raw_decoder();
668        assert_feed_err!(d, [], [0xc1], [0xbf], "");
669        assert_finish_ok!(d, "");
670
671        let mut d = UTF8Encoding.raw_decoder();
672        assert_feed_err!(d, [], [0xe0], [0x9f, 0xbf], "");
673        assert_finish_ok!(d, "");
674
675        let mut d = UTF8Encoding.raw_decoder();
676        assert_feed_err!(d, [], [0xf0], [0x8f, 0xbf, 0xbf], "");
677        assert_finish_ok!(d, "");
678    }
679
680    #[test]
681    fn test_feed_after_finish() {
682        let mut d = UTF8Encoding.raw_decoder();
683        assert_feed_ok!(d, [0xc2, 0x80], [0xc2], "\u{80}");
684        assert_finish_err!(d, "");
685        assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
686        assert_finish_ok!(d, "");
687    }
688
689    #[test]
690    fn test_correct_from_utf8() {
691        let s = testutils::ASCII_TEXT.as_bytes();
692        assert_eq!(from_utf8(s), str::from_utf8(s).ok());
693
694        let s = testutils::KOREAN_TEXT.as_bytes();
695        assert_eq!(from_utf8(s), str::from_utf8(s).ok());
696
697        let s = testutils::INVALID_UTF8_TEXT;
698        assert_eq!(from_utf8(s), str::from_utf8(s).ok());
699    }
700
701    mod bench_ascii {
702        extern crate test;
703        use super::super::{from_utf8, UTF8Encoding};
704        use crate::testutils;
705        use crate::types::*;
706        use std::str;
707
708        #[bench]
709        fn bench_encode(bencher: &mut test::Bencher) {
710            let s = testutils::ASCII_TEXT;
711            bencher.bytes = s.len() as u64;
712            bencher.iter(|| test::black_box(UTF8Encoding.encode(s, EncoderTrap::Strict)))
713        }
714
715        #[bench]
716        fn bench_decode(bencher: &mut test::Bencher) {
717            let s = testutils::ASCII_TEXT.as_bytes();
718            bencher.bytes = s.len() as u64;
719            bencher.iter(|| test::black_box(UTF8Encoding.decode(s, DecoderTrap::Strict)))
720        }
721
722        #[bench]
723        fn bench_from_utf8(bencher: &mut test::Bencher) {
724            let s = testutils::ASCII_TEXT.as_bytes();
725            bencher.bytes = s.len() as u64;
726            bencher.iter(|| test::black_box(from_utf8(s)))
727        }
728
729        #[bench] // for the comparison
730        fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
731            let s = testutils::ASCII_TEXT.as_bytes();
732            bencher.bytes = s.len() as u64;
733            bencher.iter(|| test::black_box(str::from_utf8(s)))
734        }
735
736        #[bench] // for the comparison
737        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
738            let s = testutils::ASCII_TEXT.as_bytes();
739            bencher.bytes = s.len() as u64;
740            bencher.iter(|| test::black_box(String::from_utf8_lossy(s)))
741        }
742    }
743
744    // why Korean? it has an excellent mix of multibyte sequences and ASCII sequences
745    // unlike other CJK scripts, so it reflects a practical use case a bit better.
746    mod bench_korean {
747        extern crate test;
748        use super::super::{from_utf8, UTF8Encoding};
749        use crate::testutils;
750        use crate::types::*;
751        use std::str;
752
753        #[bench]
754        fn bench_encode(bencher: &mut test::Bencher) {
755            let s = testutils::KOREAN_TEXT;
756            bencher.bytes = s.len() as u64;
757            bencher.iter(|| test::black_box(UTF8Encoding.encode(s, EncoderTrap::Strict)))
758        }
759
760        #[bench]
761        fn bench_decode(bencher: &mut test::Bencher) {
762            let s = testutils::KOREAN_TEXT.as_bytes();
763            bencher.bytes = s.len() as u64;
764            bencher.iter(|| test::black_box(UTF8Encoding.decode(s, DecoderTrap::Strict)))
765        }
766
767        #[bench]
768        fn bench_from_utf8(bencher: &mut test::Bencher) {
769            let s = testutils::KOREAN_TEXT.as_bytes();
770            bencher.bytes = s.len() as u64;
771            bencher.iter(|| test::black_box(from_utf8(s)))
772        }
773
774        #[bench] // for the comparison
775        fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
776            let s = testutils::KOREAN_TEXT.as_bytes();
777            bencher.bytes = s.len() as u64;
778            bencher.iter(|| test::black_box(str::from_utf8(s)))
779        }
780
781        #[bench] // for the comparison
782        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
783            let s = testutils::KOREAN_TEXT.as_bytes();
784            bencher.bytes = s.len() as u64;
785            bencher.iter(|| test::black_box(String::from_utf8_lossy(s)))
786        }
787    }
788
789    mod bench_lossy_invalid {
790        extern crate test;
791        use super::super::{from_utf8, UTF8Encoding};
792        use crate::testutils;
793        use crate::types::DecoderTrap::Replace as DecodeReplace;
794        use crate::types::*;
795        use std::str;
796
797        #[bench]
798        fn bench_decode_replace(bencher: &mut test::Bencher) {
799            let s = testutils::INVALID_UTF8_TEXT;
800            bencher.bytes = s.len() as u64;
801            bencher.iter(|| test::black_box(UTF8Encoding.decode(s, DecodeReplace)))
802        }
803
804        #[bench] // for the comparison
805        fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
806            let s = testutils::INVALID_UTF8_TEXT;
807            bencher.bytes = s.len() as u64;
808            bencher.iter(|| test::black_box(from_utf8(s)))
809        }
810
811        #[bench] // for the comparison
812        fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
813            let s = testutils::INVALID_UTF8_TEXT;
814            bencher.bytes = s.len() as u64;
815            bencher.iter(|| test::black_box(str::from_utf8(s)))
816        }
817
818        #[bench] // for the comparison
819        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
820            let s = testutils::INVALID_UTF8_TEXT;
821            bencher.bytes = s.len() as u64;
822            bencher.iter(|| test::black_box(String::from_utf8_lossy(s)))
823        }
824    }
825
826    mod bench_lossy_external {
827        extern crate test;
828        use super::super::{from_utf8, UTF8Encoding};
829        use crate::testutils;
830        use crate::types::DecoderTrap::Replace as DecodeReplace;
831        use crate::types::*;
832        use std::str;
833
834        #[bench]
835        fn bench_decode_replace(bencher: &mut test::Bencher) {
836            let s = testutils::get_external_bench_data();
837            bencher.bytes = s.len() as u64;
838            bencher.iter(|| test::black_box(UTF8Encoding.decode(&s, DecodeReplace)))
839        }
840
841        #[bench] // for the comparison
842        fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
843            let s = testutils::get_external_bench_data();
844            bencher.bytes = s.len() as u64;
845            bencher.iter(|| test::black_box(from_utf8(&s)))
846        }
847
848        #[bench] // for the comparison
849        fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
850            let s = testutils::get_external_bench_data();
851            bencher.bytes = s.len() as u64;
852            bencher.iter(|| test::black_box(str::from_utf8(&s)))
853        }
854
855        #[bench] // for the comparison
856        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
857            let s = testutils::get_external_bench_data();
858            bencher.bytes = s.len() as u64;
859            bencher.iter(|| test::black_box(String::from_utf8_lossy(&s)))
860        }
861    }
862}
encoding/codec/utf_8.rs

encoding/codec/
utf_8.rs