Skip to main content

utf8_zero/
lossy.rs

1use super::*;
2
3/// A push-based, lossy decoder for UTF-8.
4/// Errors are replaced with the U+FFFD replacement character.
5///
6/// Users "push" bytes into the decoder, which in turn "pushes" `&str` slices into a callback.
7///
8/// **Note:** Dropping the decoder signals the end of the input:
9/// If the last input chunk ended with an incomplete byte sequence for a code point,
10/// this is an error and a replacement character is emitted.
11/// Use `std::mem::forget` to inhibit this behavior.
12///
13/// # Examples
14///
15/// Single-shot lossy decoding (like `String::from_utf8_lossy` but returning `String`):
16///
17/// ```
18/// use utf8_zero::LossyDecoder;
19///
20/// fn string_from_utf8_lossy(input: &[u8]) -> String {
21///     let mut string = String::new();
22///     LossyDecoder::new(|s| string.push_str(s)).feed(input);
23///     string
24/// }
25///
26/// assert_eq!(string_from_utf8_lossy(b"Hello\xC0World"), "Hello\u{FFFD}World");
27/// ```
28///
29/// Streaming chunks -- a multi-byte code point split across two feeds:
30///
31/// ```
32/// use utf8_zero::LossyDecoder;
33///
34/// let mut output = String::new();
35/// {
36///     let mut decoder = LossyDecoder::new(|s| output.push_str(s));
37///     decoder.feed(b"Hello \xC3");  // first byte of U+00E9
38///     decoder.feed(b"\xA9!");       // second byte + excl mark
39/// }
40/// assert_eq!(output, "Hello \u{00E9}!");
41/// ```
42pub struct LossyDecoder<F: FnMut(&str)> {
43    push_str: F,
44    incomplete: Incomplete,
45}
46
47impl<F: FnMut(&str)> LossyDecoder<F> {
48    /// Create a new decoder from a callback.
49    #[inline]
50    pub fn new(push_str: F) -> Self {
51        LossyDecoder {
52            push_str,
53            incomplete: Incomplete {
54                buffer: [0, 0, 0, 0],
55                buffer_len: 0,
56            },
57        }
58    }
59
60    /// Feed one chunk of input into the decoder.
61    ///
62    /// The input is decoded lossily
63    /// and the callback called once or more with `&str` string slices.
64    ///
65    /// If the UTF-8 byte sequence for one code point was split into this bytes chunk
66    /// and previous bytes chunks, it will be correctly pieced back together.
67    pub fn feed(&mut self, mut input: &[u8]) {
68        if self.incomplete.buffer_len > 0 {
69            match self.incomplete.try_complete(input) {
70                Some((Ok(s), remaining)) => {
71                    (self.push_str)(s);
72                    input = remaining
73                }
74                Some((Err(_), remaining)) => {
75                    (self.push_str)(REPLACEMENT_CHARACTER);
76                    input = remaining
77                }
78                None => return,
79            }
80        }
81        loop {
82            match decode(input) {
83                Ok(s) => {
84                    (self.push_str)(s);
85                    return;
86                }
87                Err(DecodeError::Incomplete {
88                    valid_prefix,
89                    incomplete_suffix,
90                }) => {
91                    (self.push_str)(valid_prefix);
92                    self.incomplete = incomplete_suffix;
93                    return;
94                }
95                Err(DecodeError::Invalid {
96                    valid_prefix,
97                    remaining_input,
98                    ..
99                }) => {
100                    (self.push_str)(valid_prefix);
101                    (self.push_str)(REPLACEMENT_CHARACTER);
102                    input = remaining_input
103                }
104            }
105        }
106    }
107}
108
109impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
110    #[inline]
111    fn drop(&mut self) {
112        if self.incomplete.buffer_len > 0 {
113            (self.push_str)(REPLACEMENT_CHARACTER)
114        }
115    }
116}