Skip to main content

quick_xml/
encoding.rs

1//! A module for wrappers that encode / decode data.
2
3use std::borrow::Cow;
4use std::str::Utf8Error;
5
6#[cfg(feature = "encoding")]
7use encoding_rs;
8#[cfg(feature = "encoding")]
9use std::io::{self, BufRead, Read};
10
/// The Unicode "byte order mark" (\u{FEFF}) as it appears in UTF-8 encoded input.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF8_BOM: &[u8] = b"\xEF\xBB\xBF";
/// The Unicode "byte order mark" (\u{FEFF}) as it appears in UTF-16 input
/// with little-endian byte order.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF16_LE_BOM: &[u8] = b"\xFF\xFE";
/// The Unicode "byte order mark" (\u{FEFF}) as it appears in UTF-16 input
/// with big-endian byte order.
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF16_BE_BOM: &[u8] = b"\xFE\xFF";
20
/// An error that occurred while decoding or encoding.
///
/// If feature [`encoding`] is disabled, the [`EncodingError`] is always [`EncodingError::Utf8`]
///
/// [`encoding`]: ../index.html#encoding
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum EncodingError {
    /// Input was not valid UTF-8
    Utf8(Utf8Error),
    /// Input did not adhere to the given encoding
    #[cfg(feature = "encoding")]
    Other(&'static encoding_rs::Encoding),
}

impl From<Utf8Error> for EncodingError {
    /// Wraps a UTF-8 validation failure, so that `?` can be used on
    /// `std::str::from_utf8` results.
    #[inline]
    fn from(cause: Utf8Error) -> Self {
        EncodingError::Utf8(cause)
    }
}

impl std::error::Error for EncodingError {
    /// Only the UTF-8 variant carries an underlying error.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match *self {
            EncodingError::Utf8(ref cause) => Some(cause),
            #[cfg(feature = "encoding")]
            EncodingError::Other(_) => None,
        }
    }
}

impl std::fmt::Display for EncodingError {
    /// Writes a message that names the encoding that could not be applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            EncodingError::Utf8(cause) => {
                write!(f, "cannot decode input using UTF-8: {cause}")
            }
            #[cfg(feature = "encoding")]
            EncodingError::Other(encoding) => {
                f.write_str("cannot decode input using ")?;
                f.write_str(encoding.name())
            }
        }
    }
}
62
/// Decoder of byte slices into strings.
///
/// If feature [`encoding`] is enabled, the encoding is taken from the `"encoding"`
/// key of the XML declaration. UTF-8 is assumed if the XML has no `<?xml ?>`
/// declaration, if the encoding key is not defined, or if it names an unknown encoding.
///
/// The library supports any UTF-8 compatible encoding that the `encoding_rs` crate
/// supports. [*UTF-16 and ISO-2022-JP are not supported at the present*][utf16].
///
/// If feature [`encoding`] is disabled, the decoder is always a UTF-8 decoder:
/// any XML declarations are ignored.
///
/// [utf16]: https://github.com/tafia/quick-xml/issues/158
/// [`encoding`]: ../index.html#encoding
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Decoder {
    /// Encoding used by the decoding methods. The field exists only when the
    /// `encoding` feature is enabled; without it the struct has no fields.
    #[cfg(feature = "encoding")]
    pub(crate) encoding: &'static encoding_rs::Encoding,
}
82
83impl Decoder {
84    pub(crate) const fn utf8() -> Self {
85        Decoder {
86            #[cfg(feature = "encoding")]
87            encoding: encoding_rs::UTF_8,
88        }
89    }
90
91    #[cfg(all(test, feature = "encoding", feature = "serialize"))]
92    pub(crate) const fn utf16() -> Self {
93        Decoder {
94            encoding: encoding_rs::UTF_16LE,
95        }
96    }
97}
98
99impl Decoder {
100    /// Returns the `Reader`s encoding.
101    ///
102    /// This encoding will be used by [`decode`].
103    ///
104    /// [`decode`]: Self::decode
105    #[cfg(feature = "encoding")]
106    pub const fn encoding(&self) -> &'static encoding_rs::Encoding {
107        self.encoding
108    }
109
110    /// ## Without `encoding` feature
111    ///
112    /// Decodes an UTF-8 slice regardless of XML declaration and ignoring BOM
113    /// if it is present in the `bytes`.
114    ///
115    /// ## With `encoding` feature
116    ///
117    /// Decodes specified bytes using encoding, declared in the XML, if it was
118    /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
119    /// in the `bytes`.
120    ///
121    /// ----
122    /// Returns an error in case of malformed sequences in the `bytes`.
123    pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>, EncodingError> {
124        #[cfg(not(feature = "encoding"))]
125        let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?));
126
127        #[cfg(feature = "encoding")]
128        let decoded = decode(bytes, self.encoding);
129
130        decoded
131    }
132
133    /// Like [`decode`][Self::decode] but using a pre-allocated buffer.
134    pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<(), EncodingError> {
135        #[cfg(not(feature = "encoding"))]
136        buf.push_str(std::str::from_utf8(bytes)?);
137
138        #[cfg(feature = "encoding")]
139        decode_into(bytes, self.encoding, buf)?;
140
141        Ok(())
142    }
143
144    /// Decodes the `Cow` buffer, preserves the lifetime
145    pub(crate) fn decode_cow<'b>(
146        &self,
147        bytes: &Cow<'b, [u8]>,
148    ) -> Result<Cow<'b, str>, EncodingError> {
149        match bytes {
150            Cow::Borrowed(bytes) => self.decode(bytes),
151            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
152            Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
153        }
154    }
155
156    /// Decodes the `Cow` buffer, normalizes XML EOLs, preserves the lifetime
157    pub(crate) fn content<'b>(
158        &self,
159        bytes: &Cow<'b, [u8]>,
160        normalize_eol: impl Fn(&str) -> Cow<str>,
161    ) -> Result<Cow<'b, str>, EncodingError> {
162        match bytes {
163            Cow::Borrowed(bytes) => {
164                let text = self.decode(bytes)?;
165                match normalize_eol(&text) {
166                    // If text borrowed after normalization that means that it's not changed
167                    Cow::Borrowed(_) => Ok(text),
168                    Cow::Owned(s) => Ok(Cow::Owned(s)),
169                }
170            }
171            Cow::Owned(bytes) => {
172                let text = self.decode(bytes)?;
173                let text = normalize_eol(&text);
174                // Convert to owned, because otherwise Cow will be bound with wrong lifetime
175                Ok(text.into_owned().into())
176            }
177        }
178    }
179}
180
/// Decodes `bytes` using the specified `encoding`, without BOM handling.
///
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
#[cfg(feature = "encoding")]
pub fn decode<'b>(
    bytes: &'b [u8],
    encoding: &'static encoding_rs::Encoding,
) -> Result<Cow<'b, str>, EncodingError> {
    match encoding.decode_without_bom_handling_and_without_replacement(bytes) {
        Some(text) => Ok(text),
        None => Err(EncodingError::Other(encoding)),
    }
}
193
/// Like [`decode`] but appends the decoded text to a pre-allocated buffer.
#[cfg(feature = "encoding")]
pub fn decode_into(
    bytes: &[u8],
    encoding: &'static encoding_rs::Encoding,
    buf: &mut String,
) -> Result<(), EncodingError> {
    // UTF-8 input only has to be validated, not transcoded
    if encoding == encoding_rs::UTF_8 {
        let text = std::str::from_utf8(bytes)?;
        buf.push_str(text);
        return Ok(());
    }

    let mut decoder = encoding.new_decoder_without_bom_handling();
    // SAFETY: None can be returned only if required size will overflow usize,
    // but in that case String::reserve also panics
    let capacity = decoder
        .max_utf8_buffer_length_without_replacement(bytes.len())
        .unwrap();
    buf.reserve(capacity);

    let (status, consumed) = decoder.decode_to_string_without_replacement(bytes, buf, true);
    match status {
        encoding_rs::DecoderResult::Malformed(_, _) => Err(EncodingError::Other(encoding)),
        encoding_rs::DecoderResult::InputEmpty => {
            debug_assert_eq!(consumed, bytes.len());
            Ok(())
        }
        // SAFETY: We allocate enough space above
        encoding_rs::DecoderResult::OutputFull => unreachable!(),
    }
}
225
226/// Automatic encoding detection of XML files based using the
227/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
228///
229/// If encoding is detected, `Some` is returned with a [`DetectedEncoding`] that provides
230/// the BOM size in bytes (or zero if no BOM was present).
231///
232/// IF encoding was not recognized, `None` is returned.
233///
234/// Because the [`encoding_rs`] crate supports only subset of those encodings, only
235/// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
236///
237/// The algorithm suggests examine up to the first 4 bytes to determine encoding
238/// according to the following table:
239///
240/// | Bytes       |Detected encoding
241/// |-------------|------------------------------------------
242/// | **BOM**
243/// |`FE_FF_##_##`|UTF-16, big-endian
244/// |`FF FE ## ##`|UTF-16, little-endian
245/// |`EF BB BF`   |UTF-8
246/// | **No BOM**
247/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
248/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
249/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
250pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
251    // Prevent suggesting "<?xm". We want to have the same formatted lines for all arms.
252    #[allow(clippy::byte_char_slices)]
253    match bytes {
254        // with BOM
255        _ if bytes.starts_with(UTF16_BE_BOM) => Some(DetectedEncoding::Utf16BeBom),
256        _ if bytes.starts_with(UTF16_LE_BOM) => Some(DetectedEncoding::Utf16LeBom),
257        _ if bytes.starts_with(UTF8_BOM) => Some(DetectedEncoding::Utf8Bom),
258
259        // without BOM
260        _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(DetectedEncoding::Utf16BeLike), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
261        _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(DetectedEncoding::Utf16LeLike), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
262        _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => {
263            Some(DetectedEncoding::AsciiCompatible)
264        } // Some ASCII compatible
265
266        _ => None,
267    }
268}
269
/// Possible scenarios for start-of-xml detection of encoding
///
/// The value is a plain tag without payload, so it can be copied, compared
/// and printed for diagnostics.
///
/// See the documentation of [`detect_encoding`]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum DetectedEncoding {
    /// Matches UTF-8 or some other ascii-compatible encoding
    AsciiCompatible,
    /// We saw a UTF-8 BOM
    Utf8Bom,
    /// Matches UTF-16-LE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
    Utf16LeLike,
    /// We saw a UTF-16 BOM in little-endian orientation
    Utf16LeBom,
    /// Matches UTF-16-BE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
    Utf16BeLike,
    /// We saw a UTF-16 BOM in big-endian orientation
    Utf16BeBom,
}

impl DetectedEncoding {
    /// Return an Encoding object appropriate for the detected encoding.
    ///
    /// Both the BOM and the BOM-less scenario of a family map to the same
    /// encoding: UTF-8, UTF-16 LE or UTF-16 BE.
    #[cfg(feature = "encoding")]
    pub const fn encoding(&self) -> &'static encoding_rs::Encoding {
        match self {
            DetectedEncoding::AsciiCompatible | DetectedEncoding::Utf8Bom => encoding_rs::UTF_8,
            DetectedEncoding::Utf16LeLike | DetectedEncoding::Utf16LeBom => encoding_rs::UTF_16LE,
            DetectedEncoding::Utf16BeLike | DetectedEncoding::Utf16BeBom => encoding_rs::UTF_16BE,
        }
    }

    /// Length of the BOM in bytes, which may need to be stripped from the input.
    ///
    /// Returns `3` for the UTF-8 BOM, `2` for either UTF-16 BOM and `0` for
    /// the scenarios that were detected without a BOM.
    pub const fn bom_len(&self) -> usize {
        match self {
            DetectedEncoding::Utf8Bom => 3,
            DetectedEncoding::Utf16LeBom | DetectedEncoding::Utf16BeBom => 2,
            DetectedEncoding::AsciiCompatible
            | DetectedEncoding::Utf16LeLike
            | DetectedEncoding::Utf16BeLike => 0,
        }
    }
}
310
// Number of bytes read upfront so that `set_encoding()` can be called before
// the main decode loop starts. Kept small (just enough for an XML declaration)
// to limit the number of bytes decoded with a potentially wrong initial encoding.
#[cfg(feature = "encoding")]
const PREFIX_CAP: usize = 64;
316
/// Buffer for the first bytes of the input, which are read before the regular
/// decode path starts.
#[cfg(feature = "encoding")]
struct Prefix {
    /// Storage for the bytes read upfront; only `buf[..len]` holds valid data.
    buf: [u8; PREFIX_CAP],
    /// Number of bytes in `buf` that have not been fed to the decoder yet.
    len: usize,
    /// Set on the first access, when the buffer is filled and encoding
    /// detection is attempted; prevents doing that a second time.
    detected: bool,
}
323
/// A reader wrapper that decodes a byte stream from any encoding into UTF-8.
///
/// This reader wraps a [`BufRead`] source and uses [`encoding_rs::Decoder`] to
/// transcode the input into valid UTF-8. On first access, it detects the encoding
/// from BOM or XML declaration byte patterns and configures the appropriate decoder.
///
/// For UTF-8 input, this acts as a validating passthrough. For UTF-16 or other
/// encodings, the bytes are transcoded into UTF-8 in an internal buffer.
///
/// # Examples
///
/// ```
/// use std::io::Read;
/// use quick_xml::encoding::DecodingReader;
///
/// // UTF-8 input passes through:
/// let data = b"Hello, World!";
/// let mut reader = DecodingReader::new(&data[..]);
/// let mut buf = Vec::new();
/// reader.read_to_end(&mut buf).unwrap();
/// assert_eq!(buf, data);
/// ```
///
/// The example below shows how you can read documents using `DecodingReader`:
/// ```
/// use quick_xml::encoding::DecodingReader;
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// # fn to_utf16le_with_bom(string: &str) -> Vec<u8> {
/// #     let mut bytes = Vec::new();
/// #     bytes.extend_from_slice(&[0xFF, 0xFE]); // UTF-16 LE BOM
/// #     for ch in string.encode_utf16() {
/// #         bytes.extend_from_slice(&ch.to_le_bytes());
/// #     }
/// #     bytes
/// # }
/// let xml = to_utf16le_with_bom("<?xml encoding='UTF-16'?><element/>");
/// let mut decoder = DecodingReader::new(xml.as_ref());
/// let mut reader = Reader::from_reader(decoder);
///
/// let mut buf = Vec::new();
/// loop {
///     buf.clear();
///     match reader.read_event_into(&mut buf).unwrap() {
///         Event::Decl(e) => {
///             // If XML declaration contains unknown encoding name, None is returned
///             match e.encoder() {
///                 Some(encoding) => reader.get_mut().set_encoding(encoding),
///                 None => panic!("Unsupported encoding {:?}", e.encoding()),
///             }
///         }
///         Event::Eof => break,
///         _ => {}
///     }
/// }
/// ```
#[cfg(feature = "encoding")]
pub struct DecodingReader<R> {
    /// The wrapped source of encoded bytes
    inner: R,
    /// Decoder for the encoding currently in use. Starts as a UTF-8 decoder and
    /// is replaced when another encoding is detected or set explicitly.
    decoder: encoding_rs::Decoder,
    /// `encoding_rs::Decoder` panics if called after finalization (`last=true`).
    /// This flag prevents that by short-circuiting `fill_buf` after completion.
    decoder_finished: bool,
    /// Decoded UTF-8 output buffer
    out_buf: Box<[u8]>,
    /// Start of unconsumed data in out_buf
    out_pos: usize,
    /// End of valid data in out_buf
    out_len: usize,
    /// Bytes read upfront for encoding detection and XML declaration buffering.
    /// `Some` until the prefix is fully drained; `None` afterward (main decode
    /// path takes over and the allocation is freed).
    prefix: Option<Box<Prefix>>,
    /// Whether the inner reader has reached EOF
    inner_eof: bool,
}
401
#[cfg(feature = "encoding")]
impl<R: std::fmt::Debug> std::fmt::Debug for DecodingReader<R> {
    /// Prints the wrapped reader and the decoding state. The buffers themselves
    /// are left out; only their positions are shown.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut repr = f.debug_struct("DecodingReader");
        repr.field("inner", &self.inner);
        repr.field("encoding", &self.decoder.encoding());
        repr.field("out_pos", &self.out_pos);
        repr.field("out_len", &self.out_len);
        repr.field("inner_eof", &self.inner_eof);
        repr.field("prefix_active", &self.prefix.is_some());
        repr.finish()
    }
}
415
#[cfg(feature = "encoding")]
impl<R> DecodingReader<R> {
    /// Creates a new decoding reader around `inner`.
    ///
    /// The encoding is auto-detected from BOM or XML declaration patterns on
    /// first access. Defaults to UTF-8 if no pattern is recognized.
    pub fn new(inner: R) -> Self {
        let prefix = Prefix {
            buf: [0; PREFIX_CAP],
            len: 0,
            detected: false,
        };
        let out_buf = vec![0u8; 8192].into_boxed_slice();

        Self {
            inner,
            decoder: encoding_rs::UTF_8.new_decoder_without_bom_handling(),
            decoder_finished: false,
            out_buf,
            out_pos: 0,
            out_len: 0,
            prefix: Some(Box::new(prefix)),
            inner_eof: false,
        }
    }

    /// Gives shared access to the wrapped reader
    pub const fn get_ref(&self) -> &R {
        &self.inner
    }

    /// Gives exclusive access to the wrapped reader
    pub const fn get_mut(&mut self) -> &mut R {
        &mut self.inner
    }

    /// Unwraps this reader, returning the wrapped reader
    pub fn into_inner(self) -> R {
        self.inner
    }

    /// Returns the encoding currently used by the decoder.
    ///
    /// Before the first read, this is always UTF-8. After encoding detection
    /// it reflects the detected (or overridden) encoding.
    pub fn encoding(&self) -> &'static encoding_rs::Encoding {
        self.decoder.encoding()
    }

    /// Replaces the decoder with one for the given encoding. The encoding
    /// must be ASCII-compatible (the parser cannot read the declaration otherwise).
    ///
    /// Setting the encoding that is already in use does nothing and never panics.
    ///
    /// # Panics
    ///
    /// Panics if the prefix buffer has already been drained. Must be called
    /// before the prefix is exhausted — in practice, right after parsing
    /// the XML declaration.
    pub fn set_encoding(&mut self, encoding: &'static encoding_rs::Encoding) {
        // Only a real change replaces the decoder: replacing it would discard
        // its internal state (e.g. a partial multi-byte sequence), corrupting output.
        // Skipping the replacement is safe regardless of prefix state since nothing changes.
        if self.decoder.encoding() != encoding {
            assert!(
                self.prefix.is_some(),
                "set_encoding() called after prefix buffer was drained; \
                 encoding can only be changed while the prefix is still active"
            );
            self.decoder = encoding.new_decoder_without_bom_handling();
            self.decoder_finished = false;
        }
    }
}
486
// Buffered access to the decoded (UTF-8) view of the wrapped reader.
#[cfg(feature = "encoding")]
impl<R: BufRead> BufRead for DecodingReader<R> {
    /// Returns the decoded UTF-8 bytes that have not been consumed yet,
    /// decoding more input when none are left.
    ///
    /// The work is done in phases:
    /// 1. Already decoded, unconsumed output is served without touching the input.
    /// 2. While the prefix is alive: on the first access up to `PREFIX_CAP` bytes
    ///    are read from the inner reader, the encoding is detected from them and
    ///    a BOM is stripped; then the prefix bytes are decoded.
    /// 3. Afterwards input is taken from the inner reader's own buffer and, once
    ///    the inner reader reports EOF, the decoder is finalized.
    ///
    /// An empty slice is returned only after the decoder was finalized (or when
    /// a decode step fills the output buffer without writing anything).
    ///
    /// # Errors
    ///
    /// I/O errors of the inner reader are passed through (`Interrupted` is
    /// retried while the prefix is being filled). Malformed input is reported
    /// as `io::ErrorKind::InvalidData` wrapping [`EncodingError::Other`].
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        // Fast path: serve already-decoded data
        if self.out_pos < self.out_len {
            return Ok(&self.out_buf[self.out_pos..self.out_len]);
        }

        // Reset output buffer
        self.out_pos = 0;
        self.out_len = 0;

        if let Some(prefix) = &mut self.prefix {
            // On first access, fill the prefix buffer and detect encoding.
            // The prefix is large enough to hold an entire XML declaration,
            // ensuring set_encoding() can be called before the greedy main
            // decode path consumes from inner.
            if !prefix.detected {
                prefix.detected = true;

                while prefix.len < PREFIX_CAP {
                    match self.inner.read(&mut prefix.buf[prefix.len..]) {
                        Ok(0) => {
                            self.inner_eof = true;
                            break;
                        }
                        Ok(n) => prefix.len += n,
                        Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
                        Err(e) => return Err(e),
                    }
                }

                let detection_bytes = &prefix.buf[..prefix.len];
                if let Some(detected) = detect_encoding(detection_bytes) {
                    // Drop the BOM by shifting the remaining bytes to the front
                    let bom_len = detected.bom_len();
                    if bom_len > 0 {
                        prefix.buf.copy_within(bom_len..prefix.len, 0);
                        prefix.len -= bom_len;
                    }
                    // The decoder is replaced only for a non-UTF-8 detection result
                    let encoding = detected.encoding();
                    if encoding != encoding_rs::UTF_8 {
                        self.decoder = encoding.new_decoder_without_bom_handling();
                    }
                }
            }

            if self.decoder_finished {
                return Ok(&[]);
            }

            // Prefix fully decoded on a previous call - drop it and fall
            // through to the main decode path.
            if prefix.len == 0 {
                self.prefix = None;
            } else {
                // Decode from prefix buffer
                let src = &prefix.buf[..prefix.len];
                let (result, read, written) = self.decoder.decode_to_utf8_without_replacement(
                    src,
                    &mut self.out_buf[..],
                    false,
                );
                // Keep only the bytes the decoder has not read yet
                prefix.buf.copy_within(read..prefix.len, 0);
                prefix.len -= read;
                self.out_len = written;

                match result {
                    encoding_rs::DecoderResult::InputEmpty if written > 0 => {
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::InputEmpty => {
                        // prefix.len is now 0; keep prefix alive for
                        // set_encoding() - it will be dropped on the next call.
                    }
                    encoding_rs::DecoderResult::OutputFull => {
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::Malformed(_, _) => {
                        return Err(io::Error::new(
                            io::ErrorKind::InvalidData,
                            EncodingError::Other(self.decoder.encoding()),
                        ));
                    }
                }
                // InputEmpty with written == 0: prefix drained, decoder may
                // hold partial internal state (e.g. a lone byte of UTF-16).
                // Drop prefix and fall through to the main decode path.
                if prefix.len == 0 {
                    self.prefix = None;
                }
            }
        }

        if self.decoder_finished {
            return Ok(&[]);
        }

        // Loop until we produce output, hit EOF, or get an error.
        // The decoder may consume input into internal state (e.g., partial
        // UTF-16 code unit) without producing output - we must keep feeding
        // it more input rather than returning an empty slice (which signals EOF).
        loop {
            // EOF flush path: tell decoder this is the last chunk
            if self.inner_eof {
                let (result, _, written) = self.decoder.decode_to_utf8_without_replacement(
                    b"",
                    &mut self.out_buf[..],
                    true,
                );
                self.out_len = written;
                match result {
                    encoding_rs::DecoderResult::InputEmpty => {
                        self.decoder_finished = true;
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::OutputFull => {
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::Malformed(_, _) => {
                        return Err(io::Error::new(
                            io::ErrorKind::InvalidData,
                            EncodingError::Other(self.decoder.encoding()),
                        ));
                    }
                }
            }

            // Main decode path: read from inner, decode into out_buf
            let (result, read, written) = {
                let src = self.inner.fill_buf()?;
                if src.is_empty() {
                    self.inner_eof = true;
                    continue; // will hit EOF flush path on next iteration
                }
                self.decoder
                    .decode_to_utf8_without_replacement(src, &mut self.out_buf[..], false)
            };
            // Only the bytes the decoder actually read are removed from inner
            self.inner.consume(read);
            self.out_len = written;

            match result {
                encoding_rs::DecoderResult::InputEmpty if written > 0 => {
                    return Ok(&self.out_buf[..self.out_len]);
                }
                encoding_rs::DecoderResult::InputEmpty => {
                    // Decoder consumed all input but produced no output
                    // (e.g., 1 byte of a 2-byte UTF-16 code unit stored
                    // in decoder internal state). Loop to get more input.
                }
                encoding_rs::DecoderResult::OutputFull => {
                    // Output buffer full; return what we have. Remaining
                    // input will be decoded on the next fill_buf call.
                    return Ok(&self.out_buf[..self.out_len]);
                }
                encoding_rs::DecoderResult::Malformed(_, _) => {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        EncodingError::Other(self.decoder.encoding()),
                    ));
                }
            }
        }
    }

    /// Marks `amt` bytes of the decoded output as consumed.
    ///
    /// `amt` must not exceed the length of the slice last returned by
    /// `fill_buf`; this is checked only by a `debug_assert!`.
    fn consume(&mut self, amt: usize) {
        debug_assert!(
            self.out_pos + amt <= self.out_len,
            "consume({amt}) out of range: out_pos={}, out_len={}",
            self.out_pos,
            self.out_len,
        );
        self.out_pos += amt;
    }
}
661
#[cfg(feature = "encoding")]
impl<R: BufRead> Read for DecodingReader<R> {
    /// Copies decoded UTF-8 bytes into `buf` and returns how many were copied.
    ///
    /// At most one `fill_buf` result is used per call, so fewer bytes than
    /// `buf.len()` may be returned. `Ok(0)` is returned for an empty `buf` and
    /// when `fill_buf` has no more data.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() {
            return Ok(0);
        }
        let decoded = self.fill_buf()?;
        // `buf` is not empty here, so zero means that no decoded data is left
        let count = std::cmp::min(decoded.len(), buf.len());
        if count > 0 {
            buf[..count].copy_from_slice(&decoded[..count]);
            self.consume(count);
        }
        Ok(count)
    }
}
678
679#[cfg(all(test, feature = "encoding"))]
680mod decoding_reader {
681    use super::*;
682    use std::io::{BufReader, Read};
683
684    /// Helper reader that returns data in fixed-size chunks
685    struct ChunkedReader<'a> {
686        data: &'a [u8],
687        pos: usize,
688        chunk_size: usize,
689    }
690
691    impl<'a> ChunkedReader<'a> {
692        fn new(data: &'a [u8], chunk_size: usize) -> Self {
693            Self {
694                data,
695                pos: 0,
696                chunk_size,
697            }
698        }
699    }
700
701    impl<'a> Read for ChunkedReader<'a> {
702        fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
703            if self.pos >= self.data.len() {
704                return Ok(0);
705            }
706            let len = self
707                .chunk_size
708                .min(buf.len())
709                .min(self.data.len() - self.pos);
710            buf[..len].copy_from_slice(&self.data[self.pos..self.pos + len]);
711            self.pos += len;
712            Ok(len)
713        }
714    }
715
716    /// Encode a string as UTF-16 LE bytes with BOM
717    fn utf16le_with_bom(s: &str) -> Vec<u8> {
718        let mut out = vec![0xFF, 0xFE]; // UTF-16 LE BOM
719        for code_unit in s.encode_utf16() {
720            out.extend_from_slice(&code_unit.to_le_bytes());
721        }
722        out
723    }
724
725    /// Encode a string as UTF-16 BE bytes with BOM
726    fn utf16be_with_bom(s: &str) -> Vec<u8> {
727        let mut out = vec![0xFE, 0xFF]; // UTF-16 BE BOM
728        for code_unit in s.encode_utf16() {
729            out.extend_from_slice(&code_unit.to_be_bytes());
730        }
731        out
732    }
733
734    /// Encode a string as UTF-16 LE bytes without BOM
735    fn utf16le_no_bom(s: &str) -> Vec<u8> {
736        let mut out = Vec::new();
737        for code_unit in s.encode_utf16() {
738            out.extend_from_slice(&code_unit.to_le_bytes());
739        }
740        out
741    }
742
743    /// Encode a string as UTF-16 BE bytes without BOM
744    fn utf16be_no_bom(s: &str) -> Vec<u8> {
745        let mut out = Vec::new();
746        for code_unit in s.encode_utf16() {
747            out.extend_from_slice(&code_unit.to_be_bytes());
748        }
749        out
750    }
751
752    /// Read all bytes from a reader into a String
753    fn read_all(reader: &mut DecodingReader<impl BufRead>) -> io::Result<String> {
754        let mut result = Vec::new();
755        reader.read_to_end(&mut result)?;
756        Ok(String::from_utf8(result).expect("DecodingReader should produce valid UTF-8"))
757    }
758
759    /// Simple edge cases and degenerate inputs
760    mod edge_cases {
761        use super::*;
762        use pretty_assertions::assert_eq;
763
764        /// Zero-length input should immediately return EOF (n == 0).
765        #[test]
766        fn empty_input() {
767            let data = b"";
768            let mut reader = DecodingReader::new(&data[..]);
769            let mut buf = [0u8; 10];
770            let n = reader.read(&mut buf).unwrap();
771            assert_eq!(n, 0);
772        }
773
774        /// A UTF-8 BOM with no payload should decode to an empty string.
775        #[test]
776        fn utf8_bom_only() {
777            let data = b"\xEF\xBB\xBF";
778            let mut reader = DecodingReader::new(&data[..]);
779            assert_eq!(read_all(&mut reader).unwrap(), "");
780        }
781
782        /// A UTF-16 LE BOM with no payload should decode to an empty string.
783        #[test]
784        fn utf16le_bom_only() {
785            let data = &[0xFF, 0xFE];
786            let mut reader = DecodingReader::new(&data[..]);
787            assert_eq!(read_all(&mut reader).unwrap(), "");
788        }
789
790        /// A UTF-16 BE BOM with no payload should decode to an empty string.
791        #[test]
792        fn utf16be_bom_only() {
793            let data = &[0xFE, 0xFF];
794            let mut reader = DecodingReader::new(&data[..]);
795            assert_eq!(read_all(&mut reader).unwrap(), "");
796        }
797
798        /// Invalid UTF-8 (no BOM, so treated as UTF-8) must produce an error.
799        #[test]
800        fn invalid_utf8_is_rejected() {
801            let data: &[u8] = &[0x48, 0x65, 0x6C, 0xFF, 0xFE];
802            let mut reader = DecodingReader::new(&data[..]);
803            let err = read_all(&mut reader).unwrap_err();
804            assert_eq!(err.kind(), io::ErrorKind::InvalidData);
805        }
806
807        /// An odd trailing byte in UTF-16 is malformed and must produce an error.
808        #[test]
809        fn truncated_utf16_at_eof() {
810            // UTF-16 LE BOM + one valid code unit + one incomplete byte
811            let data: &[u8] = &[0xFF, 0xFE, 0x48, 0x00, 0x65];
812            let mut reader = DecodingReader::new(&data[..]);
813            let err = read_all(&mut reader).unwrap_err();
814            assert_eq!(err.kind(), io::ErrorKind::InvalidData);
815        }
816
817        /// A 1-byte output buffer forces one byte per read() call; verifies
818        /// multi-byte UTF-8 sequences are still assembled correctly.
819        #[test]
820        fn read_with_one_byte_buffer() {
821            let data = "Hello, 世界!".as_bytes();
822            let mut reader = DecodingReader::new(&data[..]);
823            let mut result = Vec::new();
824            let mut buf = [0u8; 1];
825            loop {
826                let n = reader.read(&mut buf).unwrap();
827                if n == 0 {
828                    break;
829                }
830                result.extend_from_slice(&buf[..n]);
831            }
832            assert_eq!(String::from_utf8(result).unwrap(), "Hello, 世界!");
833        }
834    }
835
836    /// Tests that exercise the BufRead contract (fill_buf + consume) directly,
837    /// as opposed to the Read-based helpers used elsewhere.
838    mod bufread_interface {
839        use super::*;
840        use pretty_assertions::assert_eq;
841        use std::io::BufRead;
842
843        /// Basic fill_buf/consume cycle: partial consume leaves remaining
844        /// data available on the next fill_buf call.
845        #[test]
846        fn fill_buf_and_consume() {
847            let data = b"Hello, World!";
848            let mut reader = DecodingReader::new(&data[..]);
849
850            let buf = reader.fill_buf().unwrap();
851            assert!(!buf.is_empty());
852            assert_eq!(buf[0], b'H');
853
854            // Consume only part of the buffer
855            reader.consume(5);
856
857            let buf = reader.fill_buf().unwrap();
858            assert!(!buf.is_empty());
859            assert_eq!(buf[0], b',');
860        }
861
862        /// Drain the reader via fill_buf/consume, then confirm it stays at EOF.
863        #[test]
864        fn partial_consume_then_read_more() {
865            let data = b"Hello, World!";
866            let mut reader = DecodingReader::new(&data[..]);
867
868            // Collect all output via fill_buf/consume
869            let mut result = Vec::new();
870            loop {
871                let buf = reader.fill_buf().unwrap();
872                if buf.is_empty() {
873                    break;
874                }
875                result.extend_from_slice(buf);
876                let len = buf.len();
877                reader.consume(len);
878            }
879            assert_eq!(std::str::from_utf8(&result).unwrap(), "Hello, World!");
880
881            // Should remain at EOF
882            let buf = reader.fill_buf().unwrap();
883            assert!(buf.is_empty());
884        }
885
886        /// Calling fill_buf() repeatedly after EOF must keep returning empty
887        /// (and not panic - encoding_rs::Decoder panics if called after finalization).
888        #[test]
889        fn fill_buf_after_eof_is_idempotent() {
890            let data = b"Hello";
891            let mut reader = DecodingReader::new(&data[..]);
892
893            loop {
894                let buf = reader.fill_buf().unwrap();
895                if buf.is_empty() {
896                    break;
897                }
898                let len = buf.len();
899                reader.consume(len);
900            }
901
902            for _ in 0..3 {
903                let buf = reader.fill_buf().unwrap();
904                assert!(buf.is_empty());
905            }
906        }
907
908        /// consume() past the buffered length must trigger a debug_assert panic.
909        #[test]
910        #[should_panic(expected = "consume")]
911        fn consume_overflow_panics_in_debug() {
912            let data = b"Hi";
913            let mut reader = DecodingReader::new(&data[..]);
914            let _ = reader.fill_buf().unwrap();
915            reader.consume(100);
916        }
917    }
918
919    mod accessors {
920        use super::*;
921        use pretty_assertions::assert_eq;
922        use std::io::Cursor;
923
924        #[test]
925        fn get_ref() {
926            let data = b"Hello";
927            let cursor = Cursor::new(data.to_vec());
928            let reader = DecodingReader::new(cursor);
929            assert_eq!(reader.get_ref().get_ref(), data);
930        }
931
932        #[test]
933        fn get_mut() {
934            let data = b"Hello";
935            let cursor = Cursor::new(data.to_vec());
936            let mut reader = DecodingReader::new(cursor);
937            reader.get_mut().set_position(2);
938            assert_eq!(reader.get_ref().position(), 2);
939        }
940
941        #[test]
942        fn into_inner() {
943            let data = b"Hello";
944            let cursor = Cursor::new(data.to_vec());
945            let reader = DecodingReader::new(cursor);
946            let inner = reader.into_inner();
947            assert_eq!(inner.get_ref(), data);
948        }
949
950        /// Default encoding before any reads is UTF-8.
951        #[test]
952        fn encoding_default_is_utf8() {
953            let reader = DecodingReader::new(&b"Hello"[..]);
954            assert_eq!(reader.encoding(), encoding_rs::UTF_8);
955        }
956    }
957
958    // TODO: These tests emulate the updating of the internal decoder after reading the XML decl.
959    // Since `Reader` currently only speaks the `BufRead` trait, we can't test that directly.
960    // Eventually once `Reader` knows about the underlying `DecodingReader` we should test
961    // that directly.
962
963    /// Tests for encoding() and set_encoding(): detection, switching,
964    /// same-encoding no-op safety, and mid-stream override behavior.
965    mod encoding_switching {
966        use super::*;
967        use pretty_assertions::assert_eq;
968        use std::io::BufRead;
969
970        /// Encoding reflects BOM detection after first read.
971        #[test]
972        fn encoding_reflects_detection() {
973            let data = utf16le_with_bom("Hello");
974            let mut reader = DecodingReader::new(&data[..]);
975            let _ = read_all(&mut reader).unwrap();
976            assert_eq!(reader.encoding(), encoding_rs::UTF_16LE);
977        }
978
979        /// set_encoding switches the active decoder.
980        #[test]
981        fn set_encoding_changes_encoding() {
982            let mut reader = DecodingReader::new(&b"Hello"[..]);
983            assert_eq!(reader.encoding(), encoding_rs::UTF_8);
984            reader.set_encoding(encoding_rs::UTF_16LE);
985            assert_eq!(reader.encoding(), encoding_rs::UTF_16LE);
986        }
987
988        /// set_encoding after reading preserves already-buffered output.
989        #[test]
990        fn set_encoding_preserves_buffered_output() {
991            let data = b"Hello";
992            let mut reader = DecodingReader::new(&data[..]);
993
994            let buf = reader.fill_buf().unwrap();
995            assert_eq!(buf, b"Hello");
996
997            reader.set_encoding(encoding_rs::WINDOWS_1252);
998            assert_eq!(reader.encoding(), encoding_rs::WINDOWS_1252);
999
1000            // Buffered data is unchanged
1001            let buf = reader.fill_buf().unwrap();
1002            assert_eq!(buf, b"Hello");
1003        }
1004
1005        /// Calling set_encoding with the already-active encoding is a no-op:
1006        /// the decoder's internal state is preserved and decoding continues
1007        /// without corruption.
1008        #[test]
1009        fn set_encoding_same_as_detected_is_noop() {
1010            let data = b"Hello, World!";
1011            let mut reader = DecodingReader::new(&data[..]);
1012
1013            // Trigger detection and consume the first chunk
1014            let first_chunk;
1015            {
1016                let buf = reader.fill_buf().unwrap();
1017                assert!(buf.len() > 0);
1018                first_chunk = std::str::from_utf8(buf).unwrap().to_string();
1019                let n = buf.len();
1020                reader.consume(n);
1021            }
1022            assert_eq!(reader.encoding(), encoding_rs::UTF_8);
1023
1024            // "Re-set" to the same encoding - must not reset decoder state
1025            reader.set_encoding(encoding_rs::UTF_8);
1026            assert_eq!(reader.encoding(), encoding_rs::UTF_8);
1027
1028            // Read the rest - combined output must equal the original string
1029            let rest = read_all(&mut reader).unwrap();
1030            assert_eq!(format!("{first_chunk}{rest}"), "Hello, World!");
1031        }
1032
1033        /// set_encoding mid-stream: read some UTF-8 data, switch encoding,
1034        /// then verify the encoding accessor reflects the change.
1035        #[test]
1036        fn set_encoding_mid_stream() {
1037            let data = b"Hello, World!";
1038            let mut reader = DecodingReader::new(&data[..]);
1039
1040            // Read a few bytes under UTF-8
1041            let buf = reader.fill_buf().unwrap();
1042            let n = std::cmp::min(buf.len(), 5);
1043            reader.consume(n);
1044
1045            assert_eq!(reader.encoding(), encoding_rs::UTF_8);
1046            reader.set_encoding(encoding_rs::WINDOWS_1252);
1047            assert_eq!(reader.encoding(), encoding_rs::WINDOWS_1252);
1048
1049            // Remaining data still readable (ASCII is identical in both encodings)
1050            let rest = read_all(&mut reader).unwrap();
1051            assert_eq!(rest, ", World!");
1052        }
1053    }
1054
1055    /// Tests exercised across a matrix of (input text x encoding x read strategy).
1056    /// Each test encodes a string, feeds it through DecodingReader, and asserts the
1057    /// decoded output matches the original. This covers BOM detection, UTF-16
1058    /// transcoding, surrogate pairs, and multi-byte UTF-8 characters in one sweep.
1059    ///
1060    /// Examples:
1061    ///
1062    /// - UTF-8 passthrough (ASCII and multibyte) with and without BOM
1063    /// - UTF-16 LE/BE decoding with and without BOM
1064    /// - BOM-less UTF-16 detection via `<?xml` byte pattern
1065    /// - UTF-16 surrogate pairs (astral plane characters)
1066    /// - Chunked input at misaligned boundaries (odd chunk sizes vs 2-byte code units)
1067    /// - One-byte-at-a-time delivery for all encodings
1068    /// - Inputs larger than the 8192-byte internal output buffer
1069    /// - Empty and single-character inputs (prefix-only decode path)
1070    mod matrix_decoding_tests {
1071        use super::*;
1072        use pretty_assertions::assert_eq;
1073
1074        struct TestCase {
1075            label: &'static str,
1076            text: &'static str,
1077        }
1078
1079        /// Short inputs that exercise different Unicode categories.
1080        const CASES: &[TestCase] = &[
1081            TestCase {
1082                label: "empty",
1083                text: "",
1084            },
1085            TestCase {
1086                label: "single_multibyte",
1087                // Single 3-byte character - entire content fits in the prefix buffer
1088                text: "€",
1089            },
1090            TestCase {
1091                label: "ascii",
1092                text: "Hello",
1093            },
1094            TestCase {
1095                label: "multibyte",
1096                // 3-byte CJK + 4-byte emoji
1097                text: "Hello, 世界! 😀",
1098            },
1099            TestCase {
1100                label: "surrogate_pairs",
1101                // U+1D11E and U+1F3B5 require surrogate pairs in UTF-16
1102                text: "Music: 𝄞🎵",
1103            },
1104            TestCase {
1105                label: "xml_declaration",
1106                // Enables BOM-less UTF-16 detection via the <?xml byte pattern
1107                text: "<?xml version=\"1.0\"?><root/>",
1108            },
1109        ];
1110
1111        /// Inputs larger than the 8192-byte internal output buffer.
1112        fn large_cases() -> Vec<(&'static str, String)> {
1113            vec![
1114                ("large_ascii", "abcdefghij".repeat(1000)),
1115                ("large_multibyte", "Hello, 世界! 😀 ".repeat(500)),
1116            ]
1117        }
1118
1119        enum Encoding {
1120            Utf8,
1121            Utf8Bom,
1122            Utf16Le,
1123            Utf16Be,
1124            Utf16LeNoBom,
1125            Utf16BeNoBom,
1126        }
1127
1128        impl Encoding {
1129            fn encode(&self, text: &str) -> Vec<u8> {
1130                match self {
1131                    Encoding::Utf8 => text.as_bytes().to_vec(),
1132                    Encoding::Utf8Bom => {
1133                        let mut out = vec![0xEF, 0xBB, 0xBF];
1134                        out.extend_from_slice(text.as_bytes());
1135                        out
1136                    }
1137                    Encoding::Utf16Le => utf16le_with_bom(text),
1138                    Encoding::Utf16Be => utf16be_with_bom(text),
1139                    Encoding::Utf16LeNoBom => utf16le_no_bom(text),
1140                    Encoding::Utf16BeNoBom => utf16be_no_bom(text),
1141                }
1142            }
1143
1144            fn label(&self) -> &'static str {
1145                match self {
1146                    Encoding::Utf8 => "utf8",
1147                    Encoding::Utf8Bom => "utf8_bom",
1148                    Encoding::Utf16Le => "utf16le",
1149                    Encoding::Utf16Be => "utf16be",
1150                    Encoding::Utf16LeNoBom => "utf16le_no_bom",
1151                    Encoding::Utf16BeNoBom => "utf16be_no_bom",
1152                }
1153            }
1154
1155            /// BOM-less UTF-16 detection requires a `<?xml` prefix, so those
1156            /// encodings are only included for inputs that start with one.
1157            fn all_for(text: &str) -> Vec<Encoding> {
1158                let mut encs = vec![
1159                    Encoding::Utf8,
1160                    Encoding::Utf8Bom,
1161                    Encoding::Utf16Le,
1162                    Encoding::Utf16Be,
1163                ];
1164                if text.starts_with("<?xml") {
1165                    encs.push(Encoding::Utf16LeNoBom);
1166                    encs.push(Encoding::Utf16BeNoBom);
1167                }
1168                encs
1169            }
1170        }
1171
1172        /// Encode -> decode with the entire input available at once.
1173        #[test]
1174        fn bulk_read() {
1175            for case in CASES {
1176                for enc in Encoding::all_for(case.text) {
1177                    let data = enc.encode(case.text);
1178                    let mut reader = DecodingReader::new(&data[..]);
1179                    assert_eq!(
1180                        read_all(&mut reader).unwrap(),
1181                        case.text,
1182                        "bulk_read failed: case={}, encoding={}",
1183                        case.label,
1184                        enc.label(),
1185                    );
1186                }
1187            }
1188            for (label, text) in large_cases() {
1189                for enc in Encoding::all_for(&text) {
1190                    let data = enc.encode(&text);
1191                    let mut reader = DecodingReader::new(&data[..]);
1192                    assert_eq!(
1193                        read_all(&mut reader).unwrap(),
1194                        text,
1195                        "bulk_read failed: case={}, encoding={}",
1196                        label,
1197                        enc.label(),
1198                    );
1199                }
1200            }
1201        }
1202
1203        /// Encode -> decode with the input delivered in fixed-size chunks via
1204        /// ChunkedReader, testing that the decoder handles arbitrary byte
1205        /// boundaries (mid-BOM, mid-code-unit, mid-surrogate-pair).
1206        #[test]
1207        fn chunked_read() {
1208            for case in CASES {
1209                for enc in Encoding::all_for(case.text) {
1210                    for chunk_size in [1, 2, 3, 4, 5] {
1211                        let data = enc.encode(case.text);
1212                        let mut reader = DecodingReader::new(BufReader::new(ChunkedReader::new(
1213                            &data, chunk_size,
1214                        )));
1215                        assert_eq!(
1216                            read_all(&mut reader).unwrap(),
1217                            case.text,
1218                            "chunked_read failed: case={}, encoding={}, chunk_size={}",
1219                            case.label,
1220                            enc.label(),
1221                            chunk_size,
1222                        );
1223                    }
1224                }
1225            }
1226        }
1227
1228        /// Same as chunked_read but with inputs exceeding the 8192-byte
1229        /// internal output buffer, exercising the multi-fill_buf decode loop.
1230        #[test]
1231        fn large_chunked_read() {
1232            for (label, text) in large_cases() {
1233                for enc in Encoding::all_for(&text) {
1234                    for chunk_size in [1, 2, 3, 4, 5] {
1235                        let data = enc.encode(&text);
1236                        let mut reader = DecodingReader::new(BufReader::new(ChunkedReader::new(
1237                            &data, chunk_size,
1238                        )));
1239                        assert_eq!(
1240                            read_all(&mut reader).unwrap(),
1241                            text,
1242                            "large_chunked_read failed: case={}, encoding={}, chunk_size={}",
1243                            label,
1244                            enc.label(),
1245                            chunk_size,
1246                        );
1247                    }
1248                }
1249            }
1250        }
1251    }
1252}