1use crate::types::*;
28use std::convert::Into;
29use std::str;
30
/// UTF-8 encoding, exposed through the crate's `Encoding` trait.
///
/// This is a unit struct: it carries no data of its own and only hands out
/// encoder and decoder objects.
#[derive(Clone, Copy)]
pub struct UTF8Encoding;
48
49impl Encoding for UTF8Encoding {
50 fn name(&self) -> &'static str {
51 "utf-8"
52 }
53 fn whatwg_name(&self) -> Option<&'static str> {
54 Some("utf-8")
55 }
56 fn raw_encoder(&self) -> Box<dyn RawEncoder> {
57 UTF8Encoder::new()
58 }
59 fn raw_decoder(&self) -> Box<dyn RawDecoder> {
60 UTF8Decoder::new()
61 }
62}
63
/// Encoder half of the UTF-8 codec.
///
/// A unit struct: the encoder keeps no state between calls.
#[derive(Clone, Copy)]
pub struct UTF8Encoder;
67
68impl UTF8Encoder {
69 #[allow(clippy::new_ret_no_self)]
70 pub fn new() -> Box<dyn RawEncoder> {
71 Box::new(UTF8Encoder)
72 }
73}
74
75impl RawEncoder for UTF8Encoder {
76 fn from_self(&self) -> Box<dyn RawEncoder> {
77 UTF8Encoder::new()
78 }
79 fn is_ascii_compatible(&self) -> bool {
80 true
81 }
82
83 fn raw_feed(
84 &mut self,
85 input: &str,
86 output: &mut dyn ByteWriter,
87 ) -> (usize, Option<CodecError>) {
88 let input: &[u8] = input.as_bytes();
89 assert!(str::from_utf8(input).is_ok());
90 output.write_bytes(input);
91 (input.len(), None)
92 }
93
94 fn raw_finish(&mut self, _output: &mut dyn ByteWriter) -> Option<CodecError> {
95 None
96 }
97}
98
/// Decoder half of the UTF-8 codec.
///
/// Input is validated byte by byte with a table-driven state machine, so it
/// can be fed in arbitrary chunks; a sequence cut off at the end of a chunk is
/// remembered until the next chunk arrives.
#[derive(Clone, Copy)]
pub struct UTF8Decoder {
    // Number of bytes of `queue` currently in use.
    queuelen: usize,
    // Bytes of a not-yet-complete sequence carried over from earlier
    // `raw_feed` calls.
    queue: [u8; 4],
    // Current state of the validation automaton; added to a byte's category
    // to index `STATE_TRANSITIONS`.
    state: u8,
}
106
107impl UTF8Decoder {
108 #[allow(clippy::new_ret_no_self)]
109 pub fn new() -> Box<dyn RawDecoder> {
110 Box::new(UTF8Decoder {
111 queuelen: 0,
112 queue: [0; 4],
113 state: INITIAL_STATE,
114 })
115 }
116}
117
// Maps every byte value to one of twelve categories (0..=11). The category is
// added to the current state to index `STATE_TRANSITIONS`.
//
//   0: 00-7F          1: 80-8F          2: C2-DF          3: E1-EC, EE-EF
//   4: ED             5: F4             6: F1-F3          7: A0-BF
//   8: C0-C1, F5-FF   9: 90-9F         10: E0            11: F0
//
// Each full row below covers 32 consecutive byte values.
static CHAR_CATEGORY: [u8; 256] = [
    // 0x00-0x1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20-0x3F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x40-0x5F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x60-0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x80-0x9F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
    // 0xA0-0xBF
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
    // 0xC0-0xDF
    8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xE0-0xFF (the final entry is on the following line)
    10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
    8,
];
141
// Transition table of the validation automaton, indexed by
// `state + CHAR_CATEGORY[byte]`. A state is the offset of its row, and the
// twelve columns of a row are the byte categories 0..=11.
//
// The row for state 86 is not spelled out separately: it is entries 86..=97,
// i.e. it overlaps the tail of the over-long row that starts at 84.
static STATE_TRANSITIONS: [u8; 110] = [
    // state 0: initial/accepting state, a new sequence starts here
    0, 98, 12, 24, 48, 84, 72, 98, 98, 98, 36, 60,
    // state 12: one more continuation byte (80-BF) completes the sequence
    86, 0, 86, 86, 86, 86, 86, 0, 86, 0, 86, 86,
    // state 24: two more continuation bytes expected
    86, 12, 86, 86, 86, 86, 86, 12, 86, 12, 86, 86,
    // state 36: after E0 -- next byte must be A0-BF, then one more
    86, 86, 86, 86, 86, 86, 86, 12, 86, 86, 86, 86,
    // state 48: after ED -- next byte must be 80-9F, then one more
    86, 12, 86, 86, 86, 86, 86, 86, 86, 12, 86, 86,
    // state 60: after F0 -- next byte must be 90-BF, then two more
    86, 86, 86, 86, 86, 86, 86, 24, 86, 24, 86, 86,
    // state 72: after F1-F3 -- three more continuation bytes expected
    86, 24, 86, 86, 86, 86, 86, 24, 86, 24, 86, 86,
    // state 84: after F4 -- next byte must be 80-8F, then two more
    // (14 entries: the last 12 double as the row of state 86)
    86, 24, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86, 86,
    // state 98: rejecting state, every byte leads back to it
    98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98, 98,
];
154
// State a freshly created decoder starts in and is reset to after an error
// or a finish.
static INITIAL_STATE: u8 = 0;
// State reached once a complete sequence has been consumed; it has the same
// value as `INITIAL_STATE`.
static ACCEPT_STATE: u8 = 0;
// Rejecting state for which `raw_feed` reports an error span that includes
// the byte that caused the rejection.
static REJECT_STATE: u8 = 98;
// Rejecting state for which `raw_feed` reports an error span that stops just
// before the byte that caused the rejection. It is also the threshold used by
// `is_reject_state!`, so it must stay below `REJECT_STATE`.
static REJECT_STATE_WITH_BACKUP: u8 = 86;
159
// True when `$state` is one of the two rejecting states, i.e. any state
// numbered `REJECT_STATE_WITH_BACKUP` or higher.
macro_rules! is_reject_state {
    ($state:expr) => {
        $state >= REJECT_STATE_WITH_BACKUP
    };
}

// Looks up the state that follows `$state` once byte `$ch` has been consumed:
// the byte is first reduced to its category, which is then used as the column
// within the row of `$state`.
macro_rules! next_state {
    ($state:expr, $ch:expr) => {
        STATE_TRANSITIONS[($state + CHAR_CATEGORY[$ch as usize]) as usize]
    };
}
164
165impl RawDecoder for UTF8Decoder {
166 fn from_self(&self) -> Box<dyn RawDecoder> {
167 UTF8Decoder::new()
168 }
169 fn is_ascii_compatible(&self) -> bool {
170 true
171 }
172
173 fn raw_feed(
174 &mut self,
175 input: &[u8],
176 output: &mut dyn StringWriter,
177 ) -> (usize, Option<CodecError>) {
178 output.writer_hint(input.len());
179
180 fn write_bytes(output: &mut dyn StringWriter, bytes: &[u8]) {
181 output.write_str(unsafe { std::str::from_utf8_unchecked(bytes) });
182 }
183
184 let mut state = self.state;
185 let mut processed = 0;
186 let mut offset = 0;
187
188 if state == INITIAL_STATE {
190 let first_msb = input
191 .iter()
192 .position(|&ch| ch >= 0x80)
193 .unwrap_or(input.len());
194 offset += first_msb;
195 processed += first_msb;
196 }
197
198 for (i, &ch) in input[offset..].iter().enumerate() {
199 state = next_state!(state, ch);
200 if state == ACCEPT_STATE {
201 processed = i + offset + 1;
202 } else if is_reject_state!(state) {
203 let upto = if state == REJECT_STATE {
204 i + offset + 1
205 } else {
206 i + offset
207 };
208 self.state = INITIAL_STATE;
209 if processed > 0 && self.queuelen > 0 {
210 write_bytes(output, &self.queue[0..self.queuelen]);
212 }
213 self.queuelen = 0;
214 write_bytes(output, &input[0..processed]);
215 return (
216 processed,
217 Some(CodecError {
218 upto: upto as isize,
219 cause: "invalid sequence".into(),
220 }),
221 );
222 }
223 }
224
225 self.state = state;
226 if processed > 0 && self.queuelen > 0 {
227 write_bytes(output, &self.queue[0..self.queuelen]);
229 self.queuelen = 0;
230 }
231 write_bytes(output, &input[0..processed]);
232 if processed < input.len() {
233 let morequeuelen = input.len() - processed;
234 for i in 0..morequeuelen {
235 self.queue[self.queuelen + i] = input[processed + i];
236 }
237 self.queuelen += morequeuelen;
238 }
239 (processed, None)
240 }
241
242 fn raw_finish(&mut self, _output: &mut dyn StringWriter) -> Option<CodecError> {
243 let state = self.state;
244 let queuelen = self.queuelen;
245 self.state = INITIAL_STATE;
246 self.queuelen = 0;
247 if state != ACCEPT_STATE {
248 Some(CodecError {
249 upto: 0,
250 cause: "incomplete sequence".into(),
251 })
252 } else {
253 assert!(queuelen == 0);
254 None
255 }
256 }
257}
258
259pub fn from_utf8(input: &[u8]) -> Option<&str> {
263 let mut iter = input.iter();
264 let mut state;
265
266 macro_rules! return_as_whole(() => (return Some(unsafe {std::str::from_utf8_unchecked(input)})));
267
268 loop {
270 match iter.next() {
271 Some(&ch) if ch < 0x80 => {}
272 Some(&ch) => {
273 state = next_state!(INITIAL_STATE, ch);
274 break;
275 }
276 None => {
277 return_as_whole!();
278 }
279 }
280 }
281
282 for &ch in iter {
283 state = next_state!(state, ch);
284 if is_reject_state!(state) {
285 return None;
286 }
287 }
288 if state != ACCEPT_STATE {
289 return None;
290 }
291 return_as_whole!();
292}
293
// Unit tests and benchmarks for the UTF-8 codec.
//
// NOTE(review): `assert_feed_ok!`, `assert_feed_err!`, `assert_finish_ok!` and
// `assert_finish_err!` are defined outside this file. Judging from their use
// here, the array arguments look like (bytes consumed, [bytes reported as
// erroneous,] bytes left over) followed by the expected decoded text --
// confirm against the macro definitions.
#[cfg(test)]
mod tests {
    use super::{from_utf8, UTF8Encoding};
    use crate::testutils;
    use crate::types::*;
    use std::str;

    // Well-formed input of every sequence length, fed in several chunks.
    #[test]
    fn test_valid() {
        // one-byte (ASCII) sequences
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0x44, 0x45, 0x46], [], "DEF");
        assert_finish_ok!(d, "");

        // two-byte sequences
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xc2, 0xa2], [], "\u{a2}");
        assert_feed_ok!(d, [0xc2, 0xac, 0xc2, 0xa9], [], "\u{ac}\u{0a9}");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(
            d,
            [
                0xd5, 0xa1, 0xd5, 0xb5, 0xd5, 0xa2, 0xd5, 0xb8, 0xd6, 0x82, 0xd5, 0xa2, 0xd5, 0xa5,
                0xd5, 0xb6
            ],
            [],
            "\u{561}\u{0575}\u{562}\u{578}\u{582}\u{562}\u{565}\u{576}"
        );
        assert_finish_ok!(d, "");

        // mostly three-byte sequences (the last feed is two-byte again)
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xed, 0x92, 0x89], [], "\u{d489}");
        assert_feed_ok!(
            d,
            [0xe6, 0xbc, 0xa2, 0xe5, 0xad, 0x97],
            [],
            "\u{6f22}\u{5b57}"
        );
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(
            d,
            [0xc9, 0x99, 0xc9, 0x94, 0xc9, 0x90],
            [],
            "\u{259}\u{0254}\u{250}"
        );
        assert_finish_ok!(d, "");

        // four-byte sequence
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xf0, 0x90, 0x82, 0x82], [], "\u{10082}");
        assert_feed_ok!(d, [], [], "");
        assert_finish_ok!(d, "");

    }

    // First and last code point of each sequence length, plus the code points
    // on either side of the surrogate gap (U+D7FF and U+E000).
    #[test]
    fn test_valid_boundary() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0x00], [], "\x00");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0x7f], [], "\x7f");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xdf, 0xbf], [], "\u{7ff}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xe0, 0xa0, 0x80], [], "\u{800}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xed, 0x9f, 0xbf], [], "\u{d7ff}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xee, 0x80, 0x80], [], "\u{e000}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xef, 0xbf, 0xbf], [], "\u{ffff}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xf0, 0x90, 0x80, 0x80], [], "\u{10000}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xf4, 0x8f, 0xbf, 0xbf], [], "\u{10ffff}");
        assert_finish_ok!(d, "");
    }

    // Valid sequences whose bytes arrive spread over several feeds.
    #[test]
    fn test_valid_partial() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xf0], "");
        assert_feed_ok!(d, [], [0x90], "");
        assert_feed_ok!(d, [], [0x82], "");
        assert_feed_ok!(d, [0x82], [0xed], "\u{10082}");
        assert_feed_ok!(d, [0x92, 0x89], [], "\u{d489}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xc2], "");
        assert_feed_ok!(d, [0xa9, 0x20], [], "\u{a9}\u{020}");
        assert_finish_ok!(d, "");
    }

    // A continuation byte (0x80..=0xBF) with no lead byte in front of it.
    #[test]
    fn test_invalid_continuation() {
        for c in 0x80..0xc0 {
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [c], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [c, c], "");
            assert_finish_ok!(d, "");
        }
    }

    // Three-byte forms that would encode surrogates: lead byte 0xED followed
    // by a second byte in 0xA0..=0xBF.
    #[test]
    fn test_invalid_surrogate() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xa0, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xad, 0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xae, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xaf, 0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xb0, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xbe, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xed], [0xbf, 0xbf], "");
        assert_finish_ok!(d, "");
    }

    // A four-byte form starting with 0xF4 0x90, which lies past U+10FFFF.
    #[test]
    fn test_invalid_boundary() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf4], [0x90, 0x90, 0x90], "");
        assert_finish_ok!(d, "");
    }

    // Bytes 0xF5..=0xFF can never start a sequence; the error is reported by
    // the feed itself, so the following finish succeeds.
    #[test]
    fn test_invalid_start_immediate_test_finish() {
        for c in 0xf5..0x100 {
            let c = c as u8;
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [], "");
            assert_finish_ok!(d, "");
        }
    }

    // Same invalid start bytes, followed by a space in the same or in the
    // next feed.
    #[test]
    fn test_invalid_start_followed_by_space() {
        for c in 0xf5..0x100 {
            let c = c as u8;

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [], "");
            assert_feed_ok!(d, [0x20], [], "\x20");
            assert_finish_ok!(d, "");
        }
    }

    // A lone lead byte (0xC2..=0xF4) at the very end of the stream.
    #[test]
    fn test_invalid_lone_start_immediate_test_finish() {
        for c in 0xc2..0xf5 {
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [c], ""); // the feed only leaves the byte pending
            assert_finish_err!(d, "");
        }
    }

    // A lone lead byte followed by a space instead of a continuation byte.
    #[test]
    fn test_invalid_lone_start_followed_by_space() {
        for c in 0xc2..0xf5 {
            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [c], ""); // the feed only leaves the byte pending
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    // A lead byte of a three- or four-byte sequence plus one valid
    // continuation byte, followed by a space; exercised with every way of
    // splitting the bytes across feeds.
    #[test]
    fn test_invalid_incomplete_three_byte_seq_followed_by_space() {
        for b in 0xe0..0xf5 {
            // 0xA0 is an acceptable second byte after both 0xE0 and 0xF0,
            // which do not accept 0x80.
            let c = if b == 0xe0 || b == 0xf0 { 0xa0 } else { 0x80 };

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [b, c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [b, c], ""); // both bytes stay pending
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [b], ""); // the lead byte stays pending
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [b], ""); // the lead byte stays pending
            assert_feed_ok!(d, [], [c], ""); // and so does the continuation byte
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    // A four-byte lead byte plus two valid continuation bytes, followed by a
    // space; exercised with several ways of splitting the bytes across feeds.
    #[test]
    fn test_invalid_incomplete_four_byte_seq_followed_by_space() {
        for a in 0xf0..0xf5 {
            // 0xF0 does not accept 0x80 as its second byte.
            let b = if a == 0xf0 { 0xa0 } else { 0x80 };
            let c = 0x80;

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_err!(d, [], [a, b, c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [a], ""); // one byte pending
            assert_feed_ok!(d, [], [b], ""); // two bytes pending
            assert_feed_ok!(d, [], [c], ""); // three bytes pending
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [a, b], ""); // two bytes pending
            assert_feed_err!(d, [], [c], [0x20], "");
            assert_finish_ok!(d, "");

            let mut d = UTF8Encoding.raw_decoder();
            assert_feed_ok!(d, [], [a, b, c], ""); // three bytes pending
            assert_feed_err!(d, [], [], [0x20], "");
            assert_finish_ok!(d, "");
        }
    }

    // A complete sequence followed by one continuation byte too many, and
    // start bytes 0xF8, 0xFC, 0xFE, 0xFF followed by continuation bytes.
    #[test]
    fn test_invalid_too_many_cont_bytes() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [0xc2, 0x80], [0x80], [], "\u{80}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [0xe0, 0xa0, 0x80], [0x80], [], "\u{800}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [0xf0, 0x90, 0x80, 0x80], [0x80], [], "\u{10000}");
        assert_finish_ok!(d, "");

        // start bytes that are rejected outright
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf8], [0x88, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfc], [0x84, 0x80, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfe], [0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xff], [0x80], "");
        assert_finish_ok!(d, "");
    }

    // The same cases as `test_invalid_too_many_cont_bytes`, with the input
    // split across two feeds.
    #[test]
    fn test_invalid_too_many_cont_bytes_partial() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xc2], "");
        assert_feed_err!(d, [0x80], [0x80], [], "\u{80}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xe0, 0xa0], "");
        assert_feed_err!(d, [0x80], [0x80], [], "\u{800}");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [], [0xf0, 0x90, 0x80], "");
        assert_feed_err!(d, [0x80], [0x80], [], "\u{10000}");
        assert_finish_ok!(d, "");

        // start bytes that are rejected outright
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf8], [], "");
        assert_feed_err!(d, [], [0x88], [0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfc], [], "");
        assert_feed_err!(d, [], [0x84], [0x80, 0x80, 0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xfe], [], "");
        assert_feed_err!(d, [], [0x80], [], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [], [0x80], [], "");
        assert_finish_ok!(d, "");
    }

    // Overlong forms with every payload bit clear (longer-than-needed
    // encodings of U+0000).
    #[test]
    fn test_invalid_overlong_minimal() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xc0], [0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xe0], [0x80, 0x80], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf0], [0x80, 0x80, 0x80], "");
        assert_finish_ok!(d, "");
    }

    // The largest overlong form of each sequence length.
    #[test]
    fn test_invalid_overlong_maximal() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xc1], [0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xe0], [0x9f, 0xbf], "");
        assert_finish_ok!(d, "");

        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_err!(d, [], [0xf0], [0x8f, 0xbf, 0xbf], "");
        assert_finish_ok!(d, "");
    }

    // A decoder whose finish reported an error can be fed again afterwards.
    #[test]
    fn test_feed_after_finish() {
        let mut d = UTF8Encoding.raw_decoder();
        assert_feed_ok!(d, [0xc2, 0x80], [0xc2], "\u{80}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xc2, 0x80], [], "\u{80}");
        assert_finish_ok!(d, "");
    }

    // `from_utf8` must agree with the standard library's validator.
    #[test]
    fn test_correct_from_utf8() {
        let s = testutils::ASCII_TEXT.as_bytes();
        assert_eq!(from_utf8(s), str::from_utf8(s).ok());

        let s = testutils::KOREAN_TEXT.as_bytes();
        assert_eq!(from_utf8(s), str::from_utf8(s).ok());

        let s = testutils::INVALID_UTF8_TEXT;
        assert_eq!(from_utf8(s), str::from_utf8(s).ok());
    }

    // Benchmarks on `testutils::ASCII_TEXT`. They use `#[bench]` and the
    // `test` crate.
    mod bench_ascii {
        extern crate test;
        use super::super::{from_utf8, UTF8Encoding};
        use crate::testutils;
        use crate::types::*;
        use std::str;

        #[bench]
        fn bench_encode(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(UTF8Encoding.encode(s, EncoderTrap::Strict)))
        }

        #[bench]
        fn bench_decode(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(UTF8Encoding.decode(s, DecoderTrap::Strict)))
        }

        #[bench]
        fn bench_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(from_utf8(s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(str::from_utf8(s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::ASCII_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(String::from_utf8_lossy(s)))
        }
    }

    // The same benchmarks on `testutils::KOREAN_TEXT`.
    mod bench_korean {
        extern crate test;
        use super::super::{from_utf8, UTF8Encoding};
        use crate::testutils;
        use crate::types::*;
        use std::str;

        #[bench]
        fn bench_encode(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(UTF8Encoding.encode(s, EncoderTrap::Strict)))
        }

        #[bench]
        fn bench_decode(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(UTF8Encoding.decode(s, DecoderTrap::Strict)))
        }

        #[bench]
        fn bench_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(from_utf8(s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(str::from_utf8(s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::KOREAN_TEXT.as_bytes();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(String::from_utf8_lossy(s)))
        }
    }

    // Benchmarks on `testutils::INVALID_UTF8_TEXT`: decoding with the
    // `Replace` trap and validation.
    mod bench_lossy_invalid {
        extern crate test;
        use super::super::{from_utf8, UTF8Encoding};
        use crate::testutils;
        use crate::types::DecoderTrap::Replace as DecodeReplace;
        use crate::types::*;
        use std::str;

        #[bench]
        fn bench_decode_replace(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(UTF8Encoding.decode(s, DecodeReplace)))
        }

        #[bench]
        fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(from_utf8(s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(str::from_utf8(s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::INVALID_UTF8_TEXT;
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(String::from_utf8_lossy(s)))
        }
    }

    // The same benchmarks on the data returned by
    // `testutils::get_external_bench_data()`.
    mod bench_lossy_external {
        extern crate test;
        use super::super::{from_utf8, UTF8Encoding};
        use crate::testutils;
        use crate::types::DecoderTrap::Replace as DecodeReplace;
        use crate::types::*;
        use std::str;

        #[bench]
        fn bench_decode_replace(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(UTF8Encoding.decode(&s, DecodeReplace)))
        }

        #[bench]
        fn bench_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(from_utf8(&s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8_failing(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(str::from_utf8(&s)))
        }

        // standard-library baseline
        #[bench]
        fn bench_stdlib_from_utf8_lossy(bencher: &mut test::Bencher) {
            let s = testutils::get_external_bench_data();
            bencher.bytes = s.len() as u64;
            bencher.iter(|| test::black_box(String::from_utf8_lossy(&s)))
        }
    }
}