quick_xml/encoding.rs
1//! A module for wrappers that encode / decode data.
2
3use std::borrow::Cow;
4use std::str::Utf8Error;
5
6#[cfg(feature = "encoding")]
7use encoding_rs;
8#[cfg(feature = "encoding")]
9use std::io::{self, BufRead, Read};
10
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-8 (bytes `EF BB BF`).
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF8_BOM: &[u8] = &[0xEF, 0xBB, 0xBF];
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with little-endian
/// byte order (bytes `FF FE`).
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF16_LE_BOM: &[u8] = &[0xFF, 0xFE];
/// Unicode "byte order mark" (\u{FEFF}) encoded as UTF-16 with big-endian
/// byte order (bytes `FE FF`).
/// See <https://unicode.org/faq/utf_bom.html#bom1>
pub(crate) const UTF16_BE_BOM: &[u8] = &[0xFE, 0xFF];
20
/// Failure to decode or encode text.
///
/// When the [`encoding`] feature is disabled, [`EncodingError::Utf8`] is the
/// only variant that can ever be produced.
///
/// [`encoding`]: ../index.html#encoding
#[derive(Clone, Debug, PartialEq, Eq)]
#[non_exhaustive]
pub enum EncodingError {
    /// The input is not valid UTF-8
    Utf8(Utf8Error),
    /// The input does not conform to the given encoding
    #[cfg(feature = "encoding")]
    Other(&'static encoding_rs::Encoding),
}

impl From<Utf8Error> for EncodingError {
    /// Wraps a UTF-8 validation failure into [`EncodingError::Utf8`].
    #[inline]
    fn from(error: Utf8Error) -> Self {
        EncodingError::Utf8(error)
    }
}

impl std::error::Error for EncodingError {
    /// Exposes the underlying [`Utf8Error`] when there is one; failures of
    /// other encodings carry no lower-level cause.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match *self {
            EncodingError::Utf8(ref cause) => Some(cause),
            #[cfg(feature = "encoding")]
            EncodingError::Other(_) => None,
        }
    }
}

impl std::fmt::Display for EncodingError {
    /// Writes a human-readable message naming the encoding that failed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match *self {
            EncodingError::Utf8(ref cause) => {
                write!(f, "cannot decode input using UTF-8: {}", cause)
            }
            #[cfg(feature = "encoding")]
            EncodingError::Other(failed) => {
                write!(f, "cannot decode input using {}", failed.name())
            }
        }
    }
}
62
/// Turns byte slices into strings.
///
/// With the [`encoding`] feature enabled, the encoding is the one named by the
/// `"encoding"` key of the XML declaration. UTF-8 is assumed when the XML has
/// no `<?xml ?>` declaration, when the key is missing, or when it names an
/// unknown encoding.
///
/// Any UTF-8 compatible encoding supported by the `encoding_rs` crate can be
/// used. [*UTF-16 and ISO-2022-JP are not supported at the present*][utf16].
///
/// With the [`encoding`] feature disabled, this is always a UTF-8 decoder and
/// XML declarations are ignored.
///
/// [utf16]: https://github.com/tafia/quick-xml/issues/158
/// [`encoding`]: ../index.html#encoding
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct Decoder {
    /// Encoding applied by the decoding methods
    #[cfg(feature = "encoding")]
    pub(crate) encoding: &'static encoding_rs::Encoding,
}

impl Decoder {
    /// Creates a decoder that treats its input as UTF-8.
    pub(crate) const fn utf8() -> Self {
        Self {
            #[cfg(feature = "encoding")]
            encoding: encoding_rs::UTF_8,
        }
    }

    /// Creates a decoder that treats its input as UTF-16 LE (test builds only).
    #[cfg(all(test, feature = "encoding", feature = "serialize"))]
    pub(crate) const fn utf16() -> Self {
        Self {
            encoding: encoding_rs::UTF_16LE,
        }
    }
}
98
99impl Decoder {
100 /// Returns the `Reader`s encoding.
101 ///
102 /// This encoding will be used by [`decode`].
103 ///
104 /// [`decode`]: Self::decode
105 #[cfg(feature = "encoding")]
106 pub const fn encoding(&self) -> &'static encoding_rs::Encoding {
107 self.encoding
108 }
109
110 /// ## Without `encoding` feature
111 ///
112 /// Decodes an UTF-8 slice regardless of XML declaration and ignoring BOM
113 /// if it is present in the `bytes`.
114 ///
115 /// ## With `encoding` feature
116 ///
117 /// Decodes specified bytes using encoding, declared in the XML, if it was
118 /// declared there, or UTF-8 otherwise, and ignoring BOM if it is present
119 /// in the `bytes`.
120 ///
121 /// ----
122 /// Returns an error in case of malformed sequences in the `bytes`.
123 pub fn decode<'b>(&self, bytes: &'b [u8]) -> Result<Cow<'b, str>, EncodingError> {
124 #[cfg(not(feature = "encoding"))]
125 let decoded = Ok(Cow::Borrowed(std::str::from_utf8(bytes)?));
126
127 #[cfg(feature = "encoding")]
128 let decoded = decode(bytes, self.encoding);
129
130 decoded
131 }
132
133 /// Like [`decode`][Self::decode] but using a pre-allocated buffer.
134 pub fn decode_into(&self, bytes: &[u8], buf: &mut String) -> Result<(), EncodingError> {
135 #[cfg(not(feature = "encoding"))]
136 buf.push_str(std::str::from_utf8(bytes)?);
137
138 #[cfg(feature = "encoding")]
139 decode_into(bytes, self.encoding, buf)?;
140
141 Ok(())
142 }
143
144 /// Decodes the `Cow` buffer, preserves the lifetime
145 pub(crate) fn decode_cow<'b>(
146 &self,
147 bytes: &Cow<'b, [u8]>,
148 ) -> Result<Cow<'b, str>, EncodingError> {
149 match bytes {
150 Cow::Borrowed(bytes) => self.decode(bytes),
151 // Convert to owned, because otherwise Cow will be bound with wrong lifetime
152 Cow::Owned(bytes) => Ok(self.decode(bytes)?.into_owned().into()),
153 }
154 }
155
156 /// Decodes the `Cow` buffer, normalizes XML EOLs, preserves the lifetime
157 pub(crate) fn content<'b>(
158 &self,
159 bytes: &Cow<'b, [u8]>,
160 normalize_eol: impl Fn(&str) -> Cow<str>,
161 ) -> Result<Cow<'b, str>, EncodingError> {
162 match bytes {
163 Cow::Borrowed(bytes) => {
164 let text = self.decode(bytes)?;
165 match normalize_eol(&text) {
166 // If text borrowed after normalization that means that it's not changed
167 Cow::Borrowed(_) => Ok(text),
168 Cow::Owned(s) => Ok(Cow::Owned(s)),
169 }
170 }
171 Cow::Owned(bytes) => {
172 let text = self.decode(bytes)?;
173 let text = normalize_eol(&text);
174 // Convert to owned, because otherwise Cow will be bound with wrong lifetime
175 Ok(text.into_owned().into())
176 }
177 }
178 }
179}
180
/// Decodes `bytes` with the given `encoding`.
///
/// Returns an error in case of malformed or non-representable sequences in the `bytes`.
#[cfg(feature = "encoding")]
pub fn decode<'b>(
    bytes: &'b [u8],
    encoding: &'static encoding_rs::Encoding,
) -> Result<Cow<'b, str>, EncodingError> {
    match encoding.decode_without_bom_handling_and_without_replacement(bytes) {
        Some(text) => Ok(text),
        None => Err(EncodingError::Other(encoding)),
    }
}
193
/// Like [`decode`] but appends the decoded text to a pre-allocated buffer.
///
/// # Errors
///
/// Returns [`EncodingError::Utf8`] when `encoding` is UTF-8 and `bytes` is not
/// valid UTF-8, or [`EncodingError::Other`] when `bytes` is malformed for any
/// other encoding. On error `buf` is left exactly as it was before the call.
///
/// # Panics
///
/// Panics if the worst-case decoded size overflows `usize`.
#[cfg(feature = "encoding")]
pub fn decode_into(
    bytes: &[u8],
    encoding: &'static encoding_rs::Encoding,
    buf: &mut String,
) -> Result<(), EncodingError> {
    if encoding == encoding_rs::UTF_8 {
        // Validation happens before anything is appended, so a failure
        // leaves `buf` untouched
        buf.push_str(std::str::from_utf8(bytes)?);
        return Ok(());
    }

    // Remember where the new text starts so a failed decode can be undone
    let start = buf.len();
    let mut decoder = encoding.new_decoder_without_bom_handling();
    let needed = decoder
        .max_utf8_buffer_length_without_replacement(bytes.len())
        // `None` means the required size overflows `usize`; `String::reserve`
        // would panic for such a size as well
        .expect("decoded size must fit into usize");
    buf.reserve(needed);

    let (result, read) = decoder.decode_to_string_without_replacement(bytes, buf, true);
    match result {
        encoding_rs::DecoderResult::InputEmpty => {
            debug_assert_eq!(read, bytes.len());
            Ok(())
        }
        encoding_rs::DecoderResult::Malformed(_, _) => {
            // Drop the text decoded before the malformed sequence so that this
            // path behaves like the UTF-8 one. `start` was the previous length
            // of `buf` and therefore lies on a char boundary.
            buf.truncate(start);
            Err(EncodingError::Other(encoding))
        }
        // Enough space for the worst case was reserved above
        encoding_rs::DecoderResult::OutputFull => unreachable!(),
    }
}
225
226/// Automatic encoding detection of XML files based using the
227/// [recommended algorithm](https://www.w3.org/TR/xml11/#sec-guessing).
228///
229/// If encoding is detected, `Some` is returned with a [`DetectedEncoding`] that provides
230/// the BOM size in bytes (or zero if no BOM was present).
231///
232/// IF encoding was not recognized, `None` is returned.
233///
234/// Because the [`encoding_rs`] crate supports only subset of those encodings, only
235/// the supported subset are detected, which is UTF-8, UTF-16 BE and UTF-16 LE.
236///
237/// The algorithm suggests examine up to the first 4 bytes to determine encoding
238/// according to the following table:
239///
240/// | Bytes |Detected encoding
241/// |-------------|------------------------------------------
242/// | **BOM**
243/// |`FE_FF_##_##`|UTF-16, big-endian
244/// |`FF FE ## ##`|UTF-16, little-endian
245/// |`EF BB BF` |UTF-8
246/// | **No BOM**
247/// |`00 3C 00 3F`|UTF-16 BE or ISO-10646-UCS-2 BE or similar 16-bit BE (use declared encoding to find the exact one)
248/// |`3C 00 3F 00`|UTF-16 LE or ISO-10646-UCS-2 LE or similar 16-bit LE (use declared encoding to find the exact one)
249/// |`3C 3F 78 6D`|UTF-8, ISO 646, ASCII, some part of ISO 8859, Shift-JIS, EUC, or any other 7-bit, 8-bit, or mixed-width encoding which ensures that the characters of ASCII have their normal positions, width, and values; the actual encoding declaration must be read to detect which of these applies, but since all of these encodings use the same bit patterns for the relevant ASCII characters, the encoding declaration itself may be read reliably
250pub fn detect_encoding(bytes: &[u8]) -> Option<DetectedEncoding> {
251 // Prevent suggesting "<?xm". We want to have the same formatted lines for all arms.
252 #[allow(clippy::byte_char_slices)]
253 match bytes {
254 // with BOM
255 _ if bytes.starts_with(UTF16_BE_BOM) => Some(DetectedEncoding::Utf16BeBom),
256 _ if bytes.starts_with(UTF16_LE_BOM) => Some(DetectedEncoding::Utf16LeBom),
257 _ if bytes.starts_with(UTF8_BOM) => Some(DetectedEncoding::Utf8Bom),
258
259 // without BOM
260 _ if bytes.starts_with(&[0x00, b'<', 0x00, b'?']) => Some(DetectedEncoding::Utf16BeLike), // Some BE encoding, for example, UTF-16 or ISO-10646-UCS-2
261 _ if bytes.starts_with(&[b'<', 0x00, b'?', 0x00]) => Some(DetectedEncoding::Utf16LeLike), // Some LE encoding, for example, UTF-16 or ISO-10646-UCS-2
262 _ if bytes.starts_with(&[b'<', b'?', b'x', b'm']) => {
263 Some(DetectedEncoding::AsciiCompatible)
264 } // Some ASCII compatible
265
266 _ => None,
267 }
268}
269
/// Possible scenarios for start-of-xml detection of encoding
///
/// See the documentation of [`detect_encoding`]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum DetectedEncoding {
    /// Matches UTF-8 or some other ascii-compatible encoding
    AsciiCompatible,
    /// We saw a UTF-8 BOM
    Utf8Bom,
    /// Matches UTF-16-LE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
    Utf16LeLike,
    /// We saw a UTF-16 BOM in little-endian orientation
    Utf16LeBom,
    /// Matches UTF-16-BE or some other UTF-16 compatible encoding (e.g. ISO-10646-UCS-2)
    Utf16BeLike,
    /// We saw a UTF-16 BOM in big-endian orientation
    Utf16BeBom,
}

impl DetectedEncoding {
    /// Returns the `Encoding` object appropriate for the detected encoding:
    /// UTF-8 for the ASCII-compatible and UTF-8 BOM cases, UTF-16 LE / BE for
    /// the corresponding BOM and BOM-less 16-bit cases.
    #[cfg(feature = "encoding")]
    pub const fn encoding(&self) -> &'static encoding_rs::Encoding {
        match self {
            DetectedEncoding::AsciiCompatible | DetectedEncoding::Utf8Bom => encoding_rs::UTF_8,
            DetectedEncoding::Utf16LeLike | DetectedEncoding::Utf16LeBom => encoding_rs::UTF_16LE,
            DetectedEncoding::Utf16BeLike | DetectedEncoding::Utf16BeBom => encoding_rs::UTF_16BE,
        }
    }

    /// Length in bytes of the BOM, which may need to be stripped from the
    /// input. Zero for the variants that were detected without a BOM.
    pub const fn bom_len(&self) -> usize {
        match self {
            DetectedEncoding::Utf8Bom => 3,
            DetectedEncoding::Utf16LeBom | DetectedEncoding::Utf16BeBom => 2,
            DetectedEncoding::AsciiCompatible
            | DetectedEncoding::Utf16LeLike
            | DetectedEncoding::Utf16BeLike => 0,
        }
    }
}
310
// Number of bytes read upfront so `set_encoding()` can be called before the
// main decode loop. Kept small (just enough for an XML declaration) to limit
// bytes decoded with a potentially wrong initial encoding.
#[cfg(feature = "encoding")]
const PREFIX_CAP: usize = 64;

/// Fixed-size buffer holding the first bytes of the input while the encoding
/// is being determined.
#[cfg(feature = "encoding")]
struct Prefix {
    /// Storage for the bytes read upfront; only `buf[..len]` is meaningful
    buf: [u8; PREFIX_CAP],
    /// Number of not yet decoded bytes at the start of `buf`
    len: usize,
    /// Whether the one-time prefix fill and encoding detection step has
    /// already been performed
    detected: bool,
}
323
/// A reader wrapper that decodes a byte stream from any encoding into UTF-8.
///
/// This reader wraps a [`BufRead`] source and uses [`encoding_rs::Decoder`] to
/// transcode the input into valid UTF-8. On first access, it detects the encoding
/// from BOM or XML declaration byte patterns and configures the appropriate decoder.
///
/// For UTF-8 input, this acts as a validating passthrough. For UTF-16 or other
/// encodings, the bytes are transcoded into UTF-8 in an internal buffer.
///
/// # Examples
///
/// ```
/// use std::io::Read;
/// use quick_xml::encoding::DecodingReader;
///
/// // UTF-8 input passes through:
/// let data = b"Hello, World!";
/// let mut reader = DecodingReader::new(&data[..]);
/// let mut buf = Vec::new();
/// reader.read_to_end(&mut buf).unwrap();
/// assert_eq!(buf, data);
/// ```
///
/// The example below shows how you can read documents using `DecodingReader`:
/// ```
/// use quick_xml::encoding::DecodingReader;
/// use quick_xml::events::Event;
/// use quick_xml::reader::Reader;
///
/// # fn to_utf16le_with_bom(string: &str) -> Vec<u8> {
/// #     let mut bytes = Vec::new();
/// #     bytes.extend_from_slice(&[0xFF, 0xFE]); // UTF-16 LE BOM
/// #     for ch in string.encode_utf16() {
/// #         bytes.extend_from_slice(&ch.to_le_bytes());
/// #     }
/// #     bytes
/// # }
/// let xml = to_utf16le_with_bom("<?xml encoding='UTF-16'?><element/>");
/// let mut decoder = DecodingReader::new(xml.as_ref());
/// let mut reader = Reader::from_reader(decoder);
///
/// let mut buf = Vec::new();
/// loop {
///     buf.clear();
///     match reader.read_event_into(&mut buf).unwrap() {
///         Event::Decl(e) => {
///             // If XML declaration contains unknown encoding name, None is returned
///             match e.encoder() {
///                 Some(encoding) => reader.get_mut().set_encoding(encoding),
///                 None => panic!("Unsupported encoding {:?}", e.encoding()),
///             }
///         }
///         Event::Eof => break,
///         _ => {}
///     }
/// }
/// ```
#[cfg(feature = "encoding")]
pub struct DecodingReader<R> {
    /// The wrapped source of encoded bytes
    inner: R,
    /// Decoder for the currently selected encoding
    decoder: encoding_rs::Decoder,
    /// `encoding_rs::Decoder` panics if called after finalization (`last=true`).
    /// This flag prevents that by short-circuiting `fill_buf` after completion.
    decoder_finished: bool,
    /// Decoded UTF-8 output buffer
    out_buf: Box<[u8]>,
    /// Start of unconsumed data in out_buf
    out_pos: usize,
    /// End of valid data in out_buf
    out_len: usize,
    /// Bytes read upfront for encoding detection and XML declaration buffering.
    /// `Some` until the prefix is fully drained; `None` afterward (main decode
    /// path takes over and the allocation is freed).
    prefix: Option<Box<Prefix>>,
    /// Whether the inner reader has reached EOF
    inner_eof: bool,
}
401
#[cfg(feature = "encoding")]
impl<R: std::fmt::Debug> std::fmt::Debug for DecodingReader<R> {
    /// Prints the wrapped reader, the active encoding and the buffer
    /// bookkeeping; buffer contents themselves are not printed.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let prefix_active = self.prefix.is_some();

        let mut out = f.debug_struct("DecodingReader");
        out.field("inner", &self.inner);
        out.field("encoding", &self.decoder.encoding());
        out.field("out_pos", &self.out_pos);
        out.field("out_len", &self.out_len);
        out.field("inner_eof", &self.inner_eof);
        out.field("prefix_active", &prefix_active);
        out.finish()
    }
}
415
#[cfg(feature = "encoding")]
impl<R> DecodingReader<R> {
    /// Wraps `inner` into a decoding reader.
    ///
    /// The encoding is auto-detected from BOM or XML declaration patterns on
    /// first access. Defaults to UTF-8 if no pattern is recognized.
    pub fn new(inner: R) -> Self {
        let prefix = Prefix {
            buf: [0; PREFIX_CAP],
            len: 0,
            detected: false,
        };
        let out_buf = vec![0u8; 8192].into_boxed_slice();

        Self {
            inner,
            decoder: encoding_rs::UTF_8.new_decoder_without_bom_handling(),
            decoder_finished: false,
            out_buf,
            out_pos: 0,
            out_len: 0,
            prefix: Some(Box::new(prefix)),
            inner_eof: false,
        }
    }

    /// Borrows the underlying reader
    pub const fn get_ref(&self) -> &R {
        &self.inner
    }

    /// Mutably borrows the underlying reader
    pub const fn get_mut(&mut self) -> &mut R {
        &mut self.inner
    }

    /// Unwraps this reader, giving back the underlying reader
    pub fn into_inner(self) -> R {
        self.inner
    }

    /// Returns the encoding currently used by the decoder.
    ///
    /// Before the first read, this is always UTF-8. After encoding detection
    /// it reflects the detected (or overridden) encoding.
    pub fn encoding(&self) -> &'static encoding_rs::Encoding {
        self.decoder.encoding()
    }

    /// Switches decoding to the given encoding. The encoding must be
    /// ASCII-compatible (the parser cannot read the declaration otherwise).
    ///
    /// Passing the encoding that is already active does nothing.
    ///
    /// # Panics
    ///
    /// Panics if the encoding differs from the active one and the prefix
    /// buffer has already been drained. Must be called before the prefix is
    /// exhausted — in practice, right after parsing the XML declaration.
    pub fn set_encoding(&mut self, encoding: &'static encoding_rs::Encoding) {
        // Only act on a real change: swapping in a fresh decoder for the same
        // encoding would throw away its internal state (e.g. a partial
        // multi-byte sequence) and corrupt the output. Skipping is safe in any
        // prefix state because nothing is modified.
        if self.decoder.encoding() != encoding {
            assert!(
                self.prefix.is_some(),
                "set_encoding() called after prefix buffer was drained; \
                 encoding can only be changed while the prefix is still active"
            );
            self.decoder = encoding.new_decoder_without_bom_handling();
            self.decoder_finished = false;
        }
    }
}
486
#[cfg(feature = "encoding")]
impl<R: BufRead> BufRead for DecodingReader<R> {
    /// Returns the decoded UTF-8 bytes that have not been consumed yet,
    /// decoding more input when the output buffer is empty.
    ///
    /// An empty slice is returned only at the end of the input.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors of the inner reader. Input that is malformed for
    /// the active encoding is reported as [`io::ErrorKind::InvalidData`]
    /// wrapping an [`EncodingError::Other`].
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        // Fast path: serve already-decoded data
        if self.out_pos < self.out_len {
            return Ok(&self.out_buf[self.out_pos..self.out_len]);
        }

        // Reset output buffer
        self.out_pos = 0;
        self.out_len = 0;

        if let Some(prefix) = &mut self.prefix {
            // On first access, fill the prefix buffer and detect encoding.
            // The prefix is meant to hold the XML declaration so that
            // set_encoding() can be called before the greedy main decode
            // path consumes from inner.
            if !prefix.detected {
                while prefix.len < PREFIX_CAP {
                    match self.inner.read(&mut prefix.buf[prefix.len..]) {
                        Ok(0) => {
                            self.inner_eof = true;
                            break;
                        }
                        Ok(n) => prefix.len += n,
                        Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
                        // `detected` is still false here, so a later call
                        // resumes the fill and still performs the detection
                        Err(e) => return Err(e),
                    }
                }

                // Mark the step as done only once the fill has completed.
                // Setting the flag before the reads would make an I/O error
                // above skip BOM stripping and detection for good.
                prefix.detected = true;

                let detection_bytes = &prefix.buf[..prefix.len];
                if let Some(detected) = detect_encoding(detection_bytes) {
                    let bom_len = detected.bom_len();
                    if bom_len > 0 {
                        // Strip the BOM so that it is never decoded
                        prefix.buf.copy_within(bom_len..prefix.len, 0);
                        prefix.len -= bom_len;
                    }
                    let encoding = detected.encoding();
                    if encoding != encoding_rs::UTF_8 {
                        self.decoder = encoding.new_decoder_without_bom_handling();
                    }
                }
            }

            if self.decoder_finished {
                return Ok(&[]);
            }

            // Prefix fully decoded on a previous call - drop it and fall
            // through to the main decode path.
            if prefix.len == 0 {
                self.prefix = None;
            } else {
                // Decode from prefix buffer
                let src = &prefix.buf[..prefix.len];
                let (result, read, written) = self.decoder.decode_to_utf8_without_replacement(
                    src,
                    &mut self.out_buf[..],
                    false,
                );
                // Shift the undecoded remainder to the front of the prefix
                prefix.buf.copy_within(read..prefix.len, 0);
                prefix.len -= read;
                // NOTE(review): this is stored before `result` is inspected, so
                // after a `Malformed` error the bytes decoded before the bad
                // sequence are still served by the next call.
                self.out_len = written;

                match result {
                    encoding_rs::DecoderResult::InputEmpty if written > 0 => {
                        // The (now empty) prefix stays alive while this output
                        // is being consumed, so set_encoding() is still allowed;
                        // it is dropped by a later call.
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::InputEmpty => {
                        // Nothing was written; handled below the match.
                    }
                    encoding_rs::DecoderResult::OutputFull => {
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::Malformed(_, _) => {
                        return Err(io::Error::new(
                            io::ErrorKind::InvalidData,
                            EncodingError::Other(self.decoder.encoding()),
                        ));
                    }
                }
                // InputEmpty with written == 0: prefix drained, decoder may
                // hold partial internal state (e.g. a lone byte of UTF-16).
                // Drop prefix and fall through to the main decode path.
                if prefix.len == 0 {
                    self.prefix = None;
                }
            }
        }

        if self.decoder_finished {
            return Ok(&[]);
        }

        // Loop until we produce output, hit EOF, or get an error.
        // The decoder may consume input into internal state (e.g., partial
        // UTF-16 code unit) without producing output - we must keep feeding
        // it more input rather than returning an empty slice (which signals EOF).
        loop {
            // EOF flush path: tell decoder this is the last chunk
            if self.inner_eof {
                let (result, _, written) = self.decoder.decode_to_utf8_without_replacement(
                    b"",
                    &mut self.out_buf[..],
                    true,
                );
                self.out_len = written;
                match result {
                    encoding_rs::DecoderResult::InputEmpty => {
                        self.decoder_finished = true;
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::OutputFull => {
                        return Ok(&self.out_buf[..self.out_len]);
                    }
                    encoding_rs::DecoderResult::Malformed(_, _) => {
                        return Err(io::Error::new(
                            io::ErrorKind::InvalidData,
                            EncodingError::Other(self.decoder.encoding()),
                        ));
                    }
                }
            }

            // Main decode path: read from inner, decode into out_buf
            let (result, read, written) = {
                let src = self.inner.fill_buf()?;
                if src.is_empty() {
                    self.inner_eof = true;
                    continue; // will hit EOF flush path on next iteration
                }
                self.decoder
                    .decode_to_utf8_without_replacement(src, &mut self.out_buf[..], false)
            };
            self.inner.consume(read);
            self.out_len = written;

            match result {
                encoding_rs::DecoderResult::InputEmpty if written > 0 => {
                    return Ok(&self.out_buf[..self.out_len]);
                }
                encoding_rs::DecoderResult::InputEmpty => {
                    // Decoder consumed all input but produced no output
                    // (e.g., 1 byte of a 2-byte UTF-16 code unit stored
                    // in decoder internal state). Loop to get more input.
                }
                encoding_rs::DecoderResult::OutputFull => {
                    // Output buffer full; return what we have. Remaining
                    // input will be decoded on the next fill_buf call.
                    return Ok(&self.out_buf[..self.out_len]);
                }
                encoding_rs::DecoderResult::Malformed(_, _) => {
                    return Err(io::Error::new(
                        io::ErrorKind::InvalidData,
                        EncodingError::Other(self.decoder.encoding()),
                    ));
                }
            }
        }
    }

    /// Marks `amt` bytes of the decoded output as consumed.
    ///
    /// `amt` must not exceed the length of the slice last returned by
    /// `fill_buf`; this is checked in debug builds only.
    fn consume(&mut self, amt: usize) {
        debug_assert!(
            self.out_pos + amt <= self.out_len,
            "consume({amt}) out of range: out_pos={}, out_len={}",
            self.out_pos,
            self.out_len,
        );
        self.out_pos += amt;
    }
}
661
#[cfg(feature = "encoding")]
impl<R: BufRead> Read for DecodingReader<R> {
    /// Copies decoded UTF-8 bytes into `buf`, as many as are currently
    /// buffered and fit. Returns `Ok(0)` for an empty `buf` or at the end of
    /// the input.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Do not touch the source at all when there is nowhere to copy to
        if buf.is_empty() {
            return Ok(0);
        }

        let decoded = self.fill_buf()?;
        if decoded.is_empty() {
            return Ok(0);
        }

        let count = std::cmp::min(decoded.len(), buf.len());
        let (target, _) = buf.split_at_mut(count);
        target.copy_from_slice(&decoded[..count]);
        self.consume(count);
        Ok(count)
    }
}
678
679#[cfg(all(test, feature = "encoding"))]
680mod decoding_reader {
681 use super::*;
682 use std::io::{BufReader, Read};
683
684 /// Helper reader that returns data in fixed-size chunks
685 struct ChunkedReader<'a> {
686 data: &'a [u8],
687 pos: usize,
688 chunk_size: usize,
689 }
690
691 impl<'a> ChunkedReader<'a> {
692 fn new(data: &'a [u8], chunk_size: usize) -> Self {
693 Self {
694 data,
695 pos: 0,
696 chunk_size,
697 }
698 }
699 }
700
701 impl<'a> Read for ChunkedReader<'a> {
702 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
703 if self.pos >= self.data.len() {
704 return Ok(0);
705 }
706 let len = self
707 .chunk_size
708 .min(buf.len())
709 .min(self.data.len() - self.pos);
710 buf[..len].copy_from_slice(&self.data[self.pos..self.pos + len]);
711 self.pos += len;
712 Ok(len)
713 }
714 }
715
716 /// Encode a string as UTF-16 LE bytes with BOM
717 fn utf16le_with_bom(s: &str) -> Vec<u8> {
718 let mut out = vec![0xFF, 0xFE]; // UTF-16 LE BOM
719 for code_unit in s.encode_utf16() {
720 out.extend_from_slice(&code_unit.to_le_bytes());
721 }
722 out
723 }
724
725 /// Encode a string as UTF-16 BE bytes with BOM
726 fn utf16be_with_bom(s: &str) -> Vec<u8> {
727 let mut out = vec![0xFE, 0xFF]; // UTF-16 BE BOM
728 for code_unit in s.encode_utf16() {
729 out.extend_from_slice(&code_unit.to_be_bytes());
730 }
731 out
732 }
733
734 /// Encode a string as UTF-16 LE bytes without BOM
735 fn utf16le_no_bom(s: &str) -> Vec<u8> {
736 let mut out = Vec::new();
737 for code_unit in s.encode_utf16() {
738 out.extend_from_slice(&code_unit.to_le_bytes());
739 }
740 out
741 }
742
743 /// Encode a string as UTF-16 BE bytes without BOM
744 fn utf16be_no_bom(s: &str) -> Vec<u8> {
745 let mut out = Vec::new();
746 for code_unit in s.encode_utf16() {
747 out.extend_from_slice(&code_unit.to_be_bytes());
748 }
749 out
750 }
751
752 /// Read all bytes from a reader into a String
753 fn read_all(reader: &mut DecodingReader<impl BufRead>) -> io::Result<String> {
754 let mut result = Vec::new();
755 reader.read_to_end(&mut result)?;
756 Ok(String::from_utf8(result).expect("DecodingReader should produce valid UTF-8"))
757 }
758
759 /// Simple edge cases and degenerate inputs
760 mod edge_cases {
761 use super::*;
762 use pretty_assertions::assert_eq;
763
764 /// Zero-length input should immediately return EOF (n == 0).
765 #[test]
766 fn empty_input() {
767 let data = b"";
768 let mut reader = DecodingReader::new(&data[..]);
769 let mut buf = [0u8; 10];
770 let n = reader.read(&mut buf).unwrap();
771 assert_eq!(n, 0);
772 }
773
774 /// A UTF-8 BOM with no payload should decode to an empty string.
775 #[test]
776 fn utf8_bom_only() {
777 let data = b"\xEF\xBB\xBF";
778 let mut reader = DecodingReader::new(&data[..]);
779 assert_eq!(read_all(&mut reader).unwrap(), "");
780 }
781
782 /// A UTF-16 LE BOM with no payload should decode to an empty string.
783 #[test]
784 fn utf16le_bom_only() {
785 let data = &[0xFF, 0xFE];
786 let mut reader = DecodingReader::new(&data[..]);
787 assert_eq!(read_all(&mut reader).unwrap(), "");
788 }
789
790 /// A UTF-16 BE BOM with no payload should decode to an empty string.
791 #[test]
792 fn utf16be_bom_only() {
793 let data = &[0xFE, 0xFF];
794 let mut reader = DecodingReader::new(&data[..]);
795 assert_eq!(read_all(&mut reader).unwrap(), "");
796 }
797
798 /// Invalid UTF-8 (no BOM, so treated as UTF-8) must produce an error.
799 #[test]
800 fn invalid_utf8_is_rejected() {
801 let data: &[u8] = &[0x48, 0x65, 0x6C, 0xFF, 0xFE];
802 let mut reader = DecodingReader::new(&data[..]);
803 let err = read_all(&mut reader).unwrap_err();
804 assert_eq!(err.kind(), io::ErrorKind::InvalidData);
805 }
806
807 /// An odd trailing byte in UTF-16 is malformed and must produce an error.
808 #[test]
809 fn truncated_utf16_at_eof() {
810 // UTF-16 LE BOM + one valid code unit + one incomplete byte
811 let data: &[u8] = &[0xFF, 0xFE, 0x48, 0x00, 0x65];
812 let mut reader = DecodingReader::new(&data[..]);
813 let err = read_all(&mut reader).unwrap_err();
814 assert_eq!(err.kind(), io::ErrorKind::InvalidData);
815 }
816
817 /// A 1-byte output buffer forces one byte per read() call; verifies
818 /// multi-byte UTF-8 sequences are still assembled correctly.
819 #[test]
820 fn read_with_one_byte_buffer() {
821 let data = "Hello, 世界!".as_bytes();
822 let mut reader = DecodingReader::new(&data[..]);
823 let mut result = Vec::new();
824 let mut buf = [0u8; 1];
825 loop {
826 let n = reader.read(&mut buf).unwrap();
827 if n == 0 {
828 break;
829 }
830 result.extend_from_slice(&buf[..n]);
831 }
832 assert_eq!(String::from_utf8(result).unwrap(), "Hello, 世界!");
833 }
834 }
835
836 /// Tests that exercise the BufRead contract (fill_buf + consume) directly,
837 /// as opposed to the Read-based helpers used elsewhere.
838 mod bufread_interface {
839 use super::*;
840 use pretty_assertions::assert_eq;
841 use std::io::BufRead;
842
843 /// Basic fill_buf/consume cycle: partial consume leaves remaining
844 /// data available on the next fill_buf call.
845 #[test]
846 fn fill_buf_and_consume() {
847 let data = b"Hello, World!";
848 let mut reader = DecodingReader::new(&data[..]);
849
850 let buf = reader.fill_buf().unwrap();
851 assert!(!buf.is_empty());
852 assert_eq!(buf[0], b'H');
853
854 // Consume only part of the buffer
855 reader.consume(5);
856
857 let buf = reader.fill_buf().unwrap();
858 assert!(!buf.is_empty());
859 assert_eq!(buf[0], b',');
860 }
861
862 /// Drain the reader via fill_buf/consume, then confirm it stays at EOF.
863 #[test]
864 fn partial_consume_then_read_more() {
865 let data = b"Hello, World!";
866 let mut reader = DecodingReader::new(&data[..]);
867
868 // Collect all output via fill_buf/consume
869 let mut result = Vec::new();
870 loop {
871 let buf = reader.fill_buf().unwrap();
872 if buf.is_empty() {
873 break;
874 }
875 result.extend_from_slice(buf);
876 let len = buf.len();
877 reader.consume(len);
878 }
879 assert_eq!(std::str::from_utf8(&result).unwrap(), "Hello, World!");
880
881 // Should remain at EOF
882 let buf = reader.fill_buf().unwrap();
883 assert!(buf.is_empty());
884 }
885
886 /// Calling fill_buf() repeatedly after EOF must keep returning empty
887 /// (and not panic - encoding_rs::Decoder panics if called after finalization).
888 #[test]
889 fn fill_buf_after_eof_is_idempotent() {
890 let data = b"Hello";
891 let mut reader = DecodingReader::new(&data[..]);
892
893 loop {
894 let buf = reader.fill_buf().unwrap();
895 if buf.is_empty() {
896 break;
897 }
898 let len = buf.len();
899 reader.consume(len);
900 }
901
902 for _ in 0..3 {
903 let buf = reader.fill_buf().unwrap();
904 assert!(buf.is_empty());
905 }
906 }
907
908 /// consume() past the buffered length must trigger a debug_assert panic.
909 #[test]
910 #[should_panic(expected = "consume")]
911 fn consume_overflow_panics_in_debug() {
912 let data = b"Hi";
913 let mut reader = DecodingReader::new(&data[..]);
914 let _ = reader.fill_buf().unwrap();
915 reader.consume(100);
916 }
917 }
918
919 mod accessors {
920 use super::*;
921 use pretty_assertions::assert_eq;
922 use std::io::Cursor;
923
924 #[test]
925 fn get_ref() {
926 let data = b"Hello";
927 let cursor = Cursor::new(data.to_vec());
928 let reader = DecodingReader::new(cursor);
929 assert_eq!(reader.get_ref().get_ref(), data);
930 }
931
932 #[test]
933 fn get_mut() {
934 let data = b"Hello";
935 let cursor = Cursor::new(data.to_vec());
936 let mut reader = DecodingReader::new(cursor);
937 reader.get_mut().set_position(2);
938 assert_eq!(reader.get_ref().position(), 2);
939 }
940
941 #[test]
942 fn into_inner() {
943 let data = b"Hello";
944 let cursor = Cursor::new(data.to_vec());
945 let reader = DecodingReader::new(cursor);
946 let inner = reader.into_inner();
947 assert_eq!(inner.get_ref(), data);
948 }
949
950 /// Default encoding before any reads is UTF-8.
951 #[test]
952 fn encoding_default_is_utf8() {
953 let reader = DecodingReader::new(&b"Hello"[..]);
954 assert_eq!(reader.encoding(), encoding_rs::UTF_8);
955 }
956 }
957
958 // TODO: These tests emulate the updating of the internal decoder after reading the XML decl.
959 // Since `Reader` currently only speaks the `BufRead` trait, we can't test that directly.
960 // Eventually once `Reader` knows about the underlying `DecodingReader` we should test
961 // that directly.
962
963 /// Tests for encoding() and set_encoding(): detection, switching,
964 /// same-encoding no-op safety, and mid-stream override behavior.
965 mod encoding_switching {
966 use super::*;
967 use pretty_assertions::assert_eq;
968 use std::io::BufRead;
969
970 /// Encoding reflects BOM detection after first read.
971 #[test]
972 fn encoding_reflects_detection() {
973 let data = utf16le_with_bom("Hello");
974 let mut reader = DecodingReader::new(&data[..]);
975 let _ = read_all(&mut reader).unwrap();
976 assert_eq!(reader.encoding(), encoding_rs::UTF_16LE);
977 }
978
979 /// set_encoding switches the active decoder.
980 #[test]
981 fn set_encoding_changes_encoding() {
982 let mut reader = DecodingReader::new(&b"Hello"[..]);
983 assert_eq!(reader.encoding(), encoding_rs::UTF_8);
984 reader.set_encoding(encoding_rs::UTF_16LE);
985 assert_eq!(reader.encoding(), encoding_rs::UTF_16LE);
986 }
987
988 /// set_encoding after reading preserves already-buffered output.
989 #[test]
990 fn set_encoding_preserves_buffered_output() {
991 let data = b"Hello";
992 let mut reader = DecodingReader::new(&data[..]);
993
994 let buf = reader.fill_buf().unwrap();
995 assert_eq!(buf, b"Hello");
996
997 reader.set_encoding(encoding_rs::WINDOWS_1252);
998 assert_eq!(reader.encoding(), encoding_rs::WINDOWS_1252);
999
1000 // Buffered data is unchanged
1001 let buf = reader.fill_buf().unwrap();
1002 assert_eq!(buf, b"Hello");
1003 }
1004
1005 /// Calling set_encoding with the already-active encoding is a no-op:
1006 /// the decoder's internal state is preserved and decoding continues
1007 /// without corruption.
1008 #[test]
1009 fn set_encoding_same_as_detected_is_noop() {
1010 let data = b"Hello, World!";
1011 let mut reader = DecodingReader::new(&data[..]);
1012
1013 // Trigger detection and consume the first chunk
1014 let first_chunk;
1015 {
1016 let buf = reader.fill_buf().unwrap();
1017 assert!(buf.len() > 0);
1018 first_chunk = std::str::from_utf8(buf).unwrap().to_string();
1019 let n = buf.len();
1020 reader.consume(n);
1021 }
1022 assert_eq!(reader.encoding(), encoding_rs::UTF_8);
1023
1024 // "Re-set" to the same encoding - must not reset decoder state
1025 reader.set_encoding(encoding_rs::UTF_8);
1026 assert_eq!(reader.encoding(), encoding_rs::UTF_8);
1027
1028 // Read the rest - combined output must equal the original string
1029 let rest = read_all(&mut reader).unwrap();
1030 assert_eq!(format!("{first_chunk}{rest}"), "Hello, World!");
1031 }
1032
1033 /// set_encoding mid-stream: read some UTF-8 data, switch encoding,
1034 /// then verify the encoding accessor reflects the change.
1035 #[test]
1036 fn set_encoding_mid_stream() {
1037 let data = b"Hello, World!";
1038 let mut reader = DecodingReader::new(&data[..]);
1039
1040 // Read a few bytes under UTF-8
1041 let buf = reader.fill_buf().unwrap();
1042 let n = std::cmp::min(buf.len(), 5);
1043 reader.consume(n);
1044
1045 assert_eq!(reader.encoding(), encoding_rs::UTF_8);
1046 reader.set_encoding(encoding_rs::WINDOWS_1252);
1047 assert_eq!(reader.encoding(), encoding_rs::WINDOWS_1252);
1048
1049 // Remaining data still readable (ASCII is identical in both encodings)
1050 let rest = read_all(&mut reader).unwrap();
1051 assert_eq!(rest, ", World!");
1052 }
1053 }
1054
1055 /// Tests exercised across a matrix of (input text x encoding x read strategy).
1056 /// Each test encodes a string, feeds it through DecodingReader, and asserts the
1057 /// decoded output matches the original. This covers BOM detection, UTF-16
1058 /// transcoding, surrogate pairs, and multi-byte UTF-8 characters in one sweep.
1059 ///
1060 /// Examples:
1061 ///
1062 /// - UTF-8 passthrough (ASCII and multibyte) with and without BOM
1063 /// - UTF-16 LE/BE decoding with and without BOM
1064 /// - BOM-less UTF-16 detection via `<?xml` byte pattern
1065 /// - UTF-16 surrogate pairs (astral plane characters)
1066 /// - Chunked input at misaligned boundaries (odd chunk sizes vs 2-byte code units)
1067 /// - One-byte-at-a-time delivery for all encodings
1068 /// - Inputs larger than the 8192-byte internal output buffer
1069 /// - Empty and single-character inputs (prefix-only decode path)
1070 mod matrix_decoding_tests {
1071 use super::*;
1072 use pretty_assertions::assert_eq;
1073
1074 struct TestCase {
1075 label: &'static str,
1076 text: &'static str,
1077 }
1078
1079 /// Short inputs that exercise different Unicode categories.
1080 const CASES: &[TestCase] = &[
1081 TestCase {
1082 label: "empty",
1083 text: "",
1084 },
1085 TestCase {
1086 label: "single_multibyte",
1087 // Single 3-byte character - entire content fits in the prefix buffer
1088 text: "€",
1089 },
1090 TestCase {
1091 label: "ascii",
1092 text: "Hello",
1093 },
1094 TestCase {
1095 label: "multibyte",
1096 // 3-byte CJK + 4-byte emoji
1097 text: "Hello, 世界! 😀",
1098 },
1099 TestCase {
1100 label: "surrogate_pairs",
1101 // U+1D11E and U+1F3B5 require surrogate pairs in UTF-16
1102 text: "Music: 𝄞🎵",
1103 },
1104 TestCase {
1105 label: "xml_declaration",
1106 // Enables BOM-less UTF-16 detection via the <?xml byte pattern
1107 text: "<?xml version=\"1.0\"?><root/>",
1108 },
1109 ];
1110
1111 /// Inputs larger than the 8192-byte internal output buffer.
1112 fn large_cases() -> Vec<(&'static str, String)> {
1113 vec![
1114 ("large_ascii", "abcdefghij".repeat(1000)),
1115 ("large_multibyte", "Hello, 世界! 😀 ".repeat(500)),
1116 ]
1117 }
1118
1119 enum Encoding {
1120 Utf8,
1121 Utf8Bom,
1122 Utf16Le,
1123 Utf16Be,
1124 Utf16LeNoBom,
1125 Utf16BeNoBom,
1126 }
1127
1128 impl Encoding {
1129 fn encode(&self, text: &str) -> Vec<u8> {
1130 match self {
1131 Encoding::Utf8 => text.as_bytes().to_vec(),
1132 Encoding::Utf8Bom => {
1133 let mut out = vec![0xEF, 0xBB, 0xBF];
1134 out.extend_from_slice(text.as_bytes());
1135 out
1136 }
1137 Encoding::Utf16Le => utf16le_with_bom(text),
1138 Encoding::Utf16Be => utf16be_with_bom(text),
1139 Encoding::Utf16LeNoBom => utf16le_no_bom(text),
1140 Encoding::Utf16BeNoBom => utf16be_no_bom(text),
1141 }
1142 }
1143
1144 fn label(&self) -> &'static str {
1145 match self {
1146 Encoding::Utf8 => "utf8",
1147 Encoding::Utf8Bom => "utf8_bom",
1148 Encoding::Utf16Le => "utf16le",
1149 Encoding::Utf16Be => "utf16be",
1150 Encoding::Utf16LeNoBom => "utf16le_no_bom",
1151 Encoding::Utf16BeNoBom => "utf16be_no_bom",
1152 }
1153 }
1154
1155 /// BOM-less UTF-16 detection requires a `<?xml` prefix, so those
1156 /// encodings are only included for inputs that start with one.
1157 fn all_for(text: &str) -> Vec<Encoding> {
1158 let mut encs = vec![
1159 Encoding::Utf8,
1160 Encoding::Utf8Bom,
1161 Encoding::Utf16Le,
1162 Encoding::Utf16Be,
1163 ];
1164 if text.starts_with("<?xml") {
1165 encs.push(Encoding::Utf16LeNoBom);
1166 encs.push(Encoding::Utf16BeNoBom);
1167 }
1168 encs
1169 }
1170 }
1171
1172 /// Encode -> decode with the entire input available at once.
1173 #[test]
1174 fn bulk_read() {
1175 for case in CASES {
1176 for enc in Encoding::all_for(case.text) {
1177 let data = enc.encode(case.text);
1178 let mut reader = DecodingReader::new(&data[..]);
1179 assert_eq!(
1180 read_all(&mut reader).unwrap(),
1181 case.text,
1182 "bulk_read failed: case={}, encoding={}",
1183 case.label,
1184 enc.label(),
1185 );
1186 }
1187 }
1188 for (label, text) in large_cases() {
1189 for enc in Encoding::all_for(&text) {
1190 let data = enc.encode(&text);
1191 let mut reader = DecodingReader::new(&data[..]);
1192 assert_eq!(
1193 read_all(&mut reader).unwrap(),
1194 text,
1195 "bulk_read failed: case={}, encoding={}",
1196 label,
1197 enc.label(),
1198 );
1199 }
1200 }
1201 }
1202
1203 /// Encode -> decode with the input delivered in fixed-size chunks via
1204 /// ChunkedReader, testing that the decoder handles arbitrary byte
1205 /// boundaries (mid-BOM, mid-code-unit, mid-surrogate-pair).
1206 #[test]
1207 fn chunked_read() {
1208 for case in CASES {
1209 for enc in Encoding::all_for(case.text) {
1210 for chunk_size in [1, 2, 3, 4, 5] {
1211 let data = enc.encode(case.text);
1212 let mut reader = DecodingReader::new(BufReader::new(ChunkedReader::new(
1213 &data, chunk_size,
1214 )));
1215 assert_eq!(
1216 read_all(&mut reader).unwrap(),
1217 case.text,
1218 "chunked_read failed: case={}, encoding={}, chunk_size={}",
1219 case.label,
1220 enc.label(),
1221 chunk_size,
1222 );
1223 }
1224 }
1225 }
1226 }
1227
1228 /// Same as chunked_read but with inputs exceeding the 8192-byte
1229 /// internal output buffer, exercising the multi-fill_buf decode loop.
1230 #[test]
1231 fn large_chunked_read() {
1232 for (label, text) in large_cases() {
1233 for enc in Encoding::all_for(&text) {
1234 for chunk_size in [1, 2, 3, 4, 5] {
1235 let data = enc.encode(&text);
1236 let mut reader = DecodingReader::new(BufReader::new(ChunkedReader::new(
1237 &data, chunk_size,
1238 )));
1239 assert_eq!(
1240 read_all(&mut reader).unwrap(),
1241 text,
1242 "large_chunked_read failed: case={}, encoding={}, chunk_size={}",
1243 label,
1244 enc.label(),
1245 chunk_size,
1246 );
1247 }
1248 }
1249 }
1250 }
1251 }
1252}