utf8_zero/lossy.rs
1use super::*;
2
3/// A push-based, lossy decoder for UTF-8.
4/// Errors are replaced with the U+FFFD replacement character.
5///
6/// Users "push" bytes into the decoder, which in turn "pushes" `&str` slices into a callback.
7///
8/// **Note:** Dropping the decoder signals the end of the input:
9/// If the last input chunk ended with an incomplete byte sequence for a code point,
10/// this is an error and a replacement character is emitted.
11/// Use `std::mem::forget` to inhibit this behavior.
12///
13/// # Examples
14///
15/// Single-shot lossy decoding (like `String::from_utf8_lossy` but returning `String`):
16///
17/// ```
18/// use utf8_zero::LossyDecoder;
19///
20/// fn string_from_utf8_lossy(input: &[u8]) -> String {
21/// let mut string = String::new();
22/// LossyDecoder::new(|s| string.push_str(s)).feed(input);
23/// string
24/// }
25///
26/// assert_eq!(string_from_utf8_lossy(b"Hello\xC0World"), "Hello\u{FFFD}World");
27/// ```
28///
29/// Streaming chunks -- a multi-byte code point split across two feeds:
30///
31/// ```
32/// use utf8_zero::LossyDecoder;
33///
34/// let mut output = String::new();
35/// {
36/// let mut decoder = LossyDecoder::new(|s| output.push_str(s));
37/// decoder.feed(b"Hello \xC3"); // first byte of U+00E9
38/// decoder.feed(b"\xA9!"); // second byte + excl mark
39/// }
40/// assert_eq!(output, "Hello \u{00E9}!");
41/// ```
42pub struct LossyDecoder<F: FnMut(&str)> {
43 push_str: F,
44 incomplete: Incomplete,
45}
46
47impl<F: FnMut(&str)> LossyDecoder<F> {
48 /// Create a new decoder from a callback.
49 #[inline]
50 pub fn new(push_str: F) -> Self {
51 LossyDecoder {
52 push_str,
53 incomplete: Incomplete {
54 buffer: [0, 0, 0, 0],
55 buffer_len: 0,
56 },
57 }
58 }
59
60 /// Feed one chunk of input into the decoder.
61 ///
62 /// The input is decoded lossily
63 /// and the callback called once or more with `&str` string slices.
64 ///
65 /// If the UTF-8 byte sequence for one code point was split into this bytes chunk
66 /// and previous bytes chunks, it will be correctly pieced back together.
67 pub fn feed(&mut self, mut input: &[u8]) {
68 if self.incomplete.buffer_len > 0 {
69 match self.incomplete.try_complete(input) {
70 Some((Ok(s), remaining)) => {
71 (self.push_str)(s);
72 input = remaining
73 }
74 Some((Err(_), remaining)) => {
75 (self.push_str)(REPLACEMENT_CHARACTER);
76 input = remaining
77 }
78 None => return,
79 }
80 }
81 loop {
82 match decode(input) {
83 Ok(s) => {
84 (self.push_str)(s);
85 return;
86 }
87 Err(DecodeError::Incomplete {
88 valid_prefix,
89 incomplete_suffix,
90 }) => {
91 (self.push_str)(valid_prefix);
92 self.incomplete = incomplete_suffix;
93 return;
94 }
95 Err(DecodeError::Invalid {
96 valid_prefix,
97 remaining_input,
98 ..
99 }) => {
100 (self.push_str)(valid_prefix);
101 (self.push_str)(REPLACEMENT_CHARACTER);
102 input = remaining_input
103 }
104 }
105 }
106 }
107}
108
109impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
110 #[inline]
111 fn drop(&mut self) {
112 if self.incomplete.buffer_len > 0 {
113 (self.push_str)(REPLACEMENT_CHARACTER)
114 }
115 }
116}