encoding_rs_io/lib.rs
1/*!
2This crate provides streaming transcoding by implementing Rust's I/O traits
3and delegating transcoding to the
4[`encoding_rs`](https://crates.io/crates/encoding_rs)
5crate.
6
7Currently, this crate only provides a means of transcoding from a source
8encoding (that is among the encodings supported by `encoding_rs`) to UTF-8 via
9an implementation of `std::io::Read`, where errors are handled by replacing
10invalid sequences with the Unicode replacement character. Future work may
11provide additional implementations for `std::io::Write` and/or implementations
12that make stronger guarantees about UTF-8 validity.
13
14# Example
15
16This example shows how to create a decoder that transcodes UTF-16LE (the
17source) to UTF-8 (the destination).
18
19```
20extern crate encoding_rs;
21extern crate encoding_rs_io;
22
23use std::error::Error;
24use std::io::Read;
25
26use encoding_rs_io::DecodeReaderBytes;
27
28# fn main() { example().unwrap(); }
fn example() -> Result<(), Box<dyn Error>> {
30 let source_data = &b"\xFF\xFEf\x00o\x00o\x00b\x00a\x00r\x00"[..];
31 // N.B. `source_data` can be any arbitrary io::Read implementation.
32 let mut decoder = DecodeReaderBytes::new(source_data);
33
34 let mut dest = String::new();
35 // decoder implements the io::Read trait, so it can easily be plugged
36 // into any consumer expecting an arbitrary reader.
37 decoder.read_to_string(&mut dest)?;
38 assert_eq!(dest, "foobar");
39 Ok(())
40}
41```
42
43# Future work
44
45Currently, this crate only provides a way to get _possibly valid_ UTF-8 from
46some source encoding. There are other transformations that may be useful that
47we could include in this crate. Namely:
48
49* An encoder that accepts an arbitrary `std::io::Write` implementation and
50 takes valid UTF-8 and transcodes it to a selected destination encoding. This
51 encoder would implement `std::fmt::Write`.
52* A decoder that accepts an arbitrary `std::fmt::Write` implementation and
53 takes arbitrary bytes and transcodes them from a selected source
54 encoding to valid UTF-8. This decoder would implement `std::io::Write`.
55* An encoder that accepts an arbitrary `UnicodeRead` implementation and
56 takes valid UTF-8 and transcodes it to a selected destination encoding.
57 This encoder would implement `std::io::Read`.
58* A decoder that accepts an arbitrary `std::io::Read` implementation and
59 takes arbitrary bytes and transcodes them from a selected source encoding
60 to valid UTF-8. This decoder would implement the `UnicodeRead` trait.
61
62Where `UnicodeRead` is a hypothetical trait that does not yet exist. Its
63definition might look something like this:
64
65```ignore
66trait UnicodeRead {
67 fn read(&mut self, buf: &mut str) -> Result<usize>;
68}
69```
70
Interestingly, none of the above transformations corresponds to
`DecodeReaderBytes`. Namely, `DecodeReaderBytes` most closely corresponds to
the last option, but instead of guaranteeing valid UTF-8 by implementing a
trait like `UnicodeRead`, it instead implements `std::io::Read`, which pushes
UTF-8 handling on to the caller. However, it turns out that this particular
use case is important for operations like search, which can often be written
in a way that doesn't assume UTF-8 validity but still benefits from it.
78
79It's not clear which of the above transformations is actually useful, but all
80of them could theoretically exist. There is more discussion on this topic
81here (and in particular, the above formulation was taken almost verbatim from
82Simon Sapin's comments): https://github.com/hsivonen/encoding_rs/issues/8
83
It is also perhaps worth stating that this crate very much intends to
remain coupled to `encoding_rs`, which helps restrict the scope, but may be
too biased toward Web-oriented encodings to solve grander encoding challenges.
As such, it may very well be that this crate is actually a stepping stone to
something with a larger scope. But first, we must learn.
89*/
90
91extern crate encoding_rs;
92
93use std::fmt;
94use std::io::{self, Read};
95
96use encoding_rs::{Decoder, Encoding, UTF_8};
97
98use util::{BomPeeker, TinyTranscoder};
99
100mod util;
101
/// A builder for constructing a byte oriented transcoder to UTF-8.
///
/// Every field corresponds to the setter method of the same name and is
/// consulted when a `DecodeReaderBytes` is built.
#[derive(Clone, Debug)]
pub struct DecodeReaderBytesBuilder {
    /// The explicitly requested source encoding, if any. When present, a
    /// decoder for it is created at build time.
    encoding: Option<&'static Encoding>,
    /// When true, observing a UTF-8 BOM during detection does not cause a
    /// decoder to be created.
    utf8_passthru: bool,
    /// When true, BOM detection still runs even if `encoding` is set, and a
    /// detected BOM replaces the explicitly configured decoder.
    bom_override: bool,
    /// Selects whether the underlying reader is wrapped with
    /// `BomPeeker::without_bom` (true) or `BomPeeker::with_bom` (false).
    strip_bom: bool,
    /// When false, BOM detection is skipped entirely.
    bom_sniffing: bool,
}
111
112impl Default for DecodeReaderBytesBuilder {
113 fn default() -> DecodeReaderBytesBuilder {
114 DecodeReaderBytesBuilder::new()
115 }
116}
117
118impl DecodeReaderBytesBuilder {
119 /// Create a new decoder builder with a default configuration.
120 ///
121 /// By default, no explicit encoding is used, but if a UTF-8 or UTF-16
122 /// BOM is detected, then an appropriate encoding is automatically
123 /// detected and transcoding is performed (where invalid sequences map to
124 /// the Unicode replacement codepoint).
125 pub fn new() -> DecodeReaderBytesBuilder {
126 DecodeReaderBytesBuilder {
127 encoding: None,
128 utf8_passthru: false,
129 bom_override: false,
130 strip_bom: false,
131 bom_sniffing: true,
132 }
133 }
134
135 /// Build a new decoder that wraps the given reader.
136 pub fn build<R: io::Read>(&self, rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
137 self.build_with_buffer(rdr, vec![0; 8 * (1 << 10)]).unwrap()
138 }
139
140 /// Build a new decoder that wraps the given reader and uses the given
141 /// buffer internally for transcoding.
142 ///
143 /// This is useful for cases where it is advantageuous to amortize
144 /// allocation. Namely, this method permits reusing a buffer for
145 /// subsequent decoders.
146 ///
147 /// This returns an error if the buffer is smaller than 4 bytes (which is
148 /// too small to hold maximum size of a single UTF-8 encoded codepoint).
149 pub fn build_with_buffer<R: io::Read, B: AsMut<[u8]>>(
150 &self,
151 rdr: R,
152 mut buffer: B,
153 ) -> io::Result<DecodeReaderBytes<R, B>> {
154 if buffer.as_mut().len() < 4 {
155 let msg = format!(
156 "DecodeReaderBytesBuilder: buffer of size {} is too small",
157 buffer.as_mut().len(),
158 );
159 return Err(io::Error::new(io::ErrorKind::Other, msg));
160 }
161 let encoding =
162 self.encoding.map(|enc| enc.new_decoder_with_bom_removal());
163
164 // No need to do BOM detection if we opt out of it or have an explicit
165 // encoding.
166 let has_detected =
167 !self.bom_sniffing || (!self.bom_override && encoding.is_some());
168
169 let peeker = if self.strip_bom {
170 BomPeeker::without_bom(rdr)
171 } else {
172 BomPeeker::with_bom(rdr)
173 };
174 Ok(DecodeReaderBytes {
175 rdr: peeker,
176 decoder: encoding,
177 tiny: TinyTranscoder::new(),
178 utf8_passthru: self.utf8_passthru,
179 buf: buffer,
180 buflen: 0,
181 pos: 0,
182 has_detected: has_detected,
183 exhausted: false,
184 })
185 }
186
187 /// Set an explicit encoding to be used by this decoder.
188 ///
189 /// When an explicit encoding is set, BOM sniffing is disabled and the
190 /// encoding provided will be used unconditionally. Errors in the encoded
191 /// bytes are replaced by the Unicode replacement codepoint.
192 ///
193 /// By default, no explicit encoding is set.
194 pub fn encoding(
195 &mut self,
196 encoding: Option<&'static Encoding>,
197 ) -> &mut DecodeReaderBytesBuilder {
198 self.encoding = encoding;
199 self
200 }
201
202 /// Enable UTF-8 passthru, even when a UTF-8 BOM is observed.
203 ///
204 /// When an explicit encoding is not set (thereby invoking automatic
205 /// encoding detection via BOM sniffing), then a UTF-8 BOM will cause
206 /// UTF-8 transcoding to occur. In particular, if the source contains
207 /// invalid UTF-8 sequences, then they are replaced with the Unicode
208 /// replacement codepoint.
209 ///
210 /// This transcoding may not be desirable. For example, the caller may
211 /// already have its own UTF-8 handling where invalid UTF-8 is
212 /// appropriately handled, in which case, doing an extra transcoding
213 /// step is extra and unnecessary work. Enabling this option will prevent
214 /// that extra transcoding step from occurring. In this case, the bytes
215 /// emitted by the reader are passed through unchanged (including the BOM)
216 /// and the caller will be responsible for handling any invalid UTF-8.
217 ///
218 /// # Example
219 ///
220 /// This example demonstrates the effect of enabling this option on data
221 /// that includes a UTF-8 BOM but also, interestingly enough, subsequently
222 /// includes invalid UTF-8.
223 ///
224 /// ```
225 /// extern crate encoding_rs;
226 /// extern crate encoding_rs_io;
227 ///
228 /// use std::error::Error;
229 /// use std::io::Read;
230 ///
231 /// use encoding_rs_io::DecodeReaderBytesBuilder;
232 ///
233 /// # fn main() { example().unwrap(); }
234 /// fn example() -> Result<(), Box<Error>> {
235 /// let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..];
236 /// let mut decoder = DecodeReaderBytesBuilder::new()
237 /// .utf8_passthru(true)
238 /// .build(source_data);
239 ///
240 /// let mut dest = vec![];
241 /// decoder.read_to_end(&mut dest)?;
242 /// // Without the passthru option, you'd get "foo\u{FFFD}bar".
243 /// assert_eq!(dest, b"\xEF\xBB\xBFfoo\xFFbar");
244 /// Ok(())
245 /// }
246 /// ```
247 pub fn utf8_passthru(
248 &mut self,
249 yes: bool,
250 ) -> &mut DecodeReaderBytesBuilder {
251 self.utf8_passthru = yes;
252 self
253 }
254
255 /// Whether or not to always strip a BOM if one is found.
256 ///
257 /// When this is enabled, if a BOM is found at the beginning of a stream,
258 /// then it is ignored. This applies even when `utf8_passthru` is enabled
259 /// or if `bom_sniffing` is disabled.
260 ///
261 /// This is disabled by default.
262 ///
263 /// # Example
264 ///
265 /// This example shows how to remove the BOM if it's present even when
266 /// `utf8_passthru` is enabled.
267 ///
268 /// ```
269 /// extern crate encoding_rs;
270 /// extern crate encoding_rs_io;
271 ///
272 /// use std::error::Error;
273 /// use std::io::Read;
274 ///
275 /// use encoding_rs_io::DecodeReaderBytesBuilder;
276 ///
277 /// # fn main() { example().unwrap(); }
278 /// fn example() -> Result<(), Box<Error>> {
279 /// let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..];
280 /// let mut decoder = DecodeReaderBytesBuilder::new()
281 /// .utf8_passthru(true)
282 /// .strip_bom(true)
283 /// .build(source_data);
284 ///
285 /// let mut dest = vec![];
286 /// decoder.read_to_end(&mut dest)?;
287 /// // If `strip_bom` wasn't enabled, then this would include the BOM.
288 /// assert_eq!(dest, b"foo\xFFbar");
289 /// Ok(())
290 /// }
291 /// ```
292 pub fn strip_bom(&mut self, yes: bool) -> &mut DecodeReaderBytesBuilder {
293 self.strip_bom = yes;
294 self
295 }
296
297 /// Give the highest precedent to the BOM, if one is found.
298 ///
299 /// When this is enabled, and if a BOM is found, then the encoding
300 /// indicated by that BOM is used even if an explicit encoding has been
301 /// set via the `encoding` method.
302 ///
303 /// This does not override `utf8_passthru`.
304 ///
305 /// This is disabled by default.
306 pub fn bom_override(
307 &mut self,
308 yes: bool,
309 ) -> &mut DecodeReaderBytesBuilder {
310 self.bom_override = yes;
311 self
312 }
313
314 /// Enable BOM sniffing
315 ///
316 /// When this is enabled and an explicit encoding is not set, the decoder
317 /// will try to detect the encoding with BOM.
318 ///
319 /// When this is disabled and an explicit encoding is not set, the decoder
320 /// will treat the input as raw bytes. The bytes will be passed through
321 /// unchanged, including any BOM that may be present.
322 ///
323 /// This is enabled by default.
324 pub fn bom_sniffing(
325 &mut self,
326 yes: bool,
327 ) -> &mut DecodeReaderBytesBuilder {
328 self.bom_sniffing = yes;
329 self
330 }
331}
332
/// An implementation of `io::Read` that transcodes to UTF-8 in a streaming
/// fashion.
///
/// The high level goal of this decoder is to provide access to byte streams
/// that are assumed to be UTF-8 unless an encoding is otherwise specified
/// (either via a BOM or via an explicit designation of an encoding).
///
/// When no explicit source encoding is specified (via
/// `DecodeReaderBytesBuilder`), the source encoding is determined by
/// inspecting the BOM from the stream read from `R`, if one exists. If a
/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
/// invalid UTF-16 sequences translated to the Unicode replacement character.
/// Similarly if a UTF-8 BOM is seen. In all other cases, the source of the
/// underlying reader is passed through unchanged _as if_ it were UTF-8.
///
/// Since this particular reader does not guarantee providing valid UTF-8 to
/// the caller, the caller must be prepared to handle invalid UTF-8 itself.
///
/// `R` is the type of the underlying reader and `B` is the type of an internal
/// buffer used to hold bytes read from the underlying reader while they wait
/// to be transcoded. Callers may elect to reuse the internal buffer via the
/// `DecodeReaderBytesBuilder::build_with_buffer` constructor.
pub struct DecodeReaderBytes<R, B> {
    /// The underlying reader, wrapped in a peeker for reading a BOM if one
    /// exists.
    rdr: BomPeeker<R>,
    /// The underlying text decoder derived from the BOM or an explicitly
    /// specified encoding, if one exists. When this is `None`, reads are
    /// forwarded directly to `rdr`.
    decoder: Option<Decoder>,
    /// A "tiny transcoder" for use when a caller provides a buffer that is
    /// too small to write at least one UTF-8 encoded codepoint to.
    tiny: TinyTranscoder,
    /// When enabled, if a UTF-8 BOM is observed, then the bytes are passed
    /// through from the underlying reader as-is instead of passing through
    /// the UTF-8 transcoder (which will replace invalid sequences with the
    /// REPLACEMENT CHARACTER).
    utf8_passthru: bool,
    /// The internal buffer that `fill` reads source bytes into and that the
    /// decoder consumes from.
    buf: B,
    /// The current position in `buf`. Subsequent decoding starts here.
    pos: usize,
    /// The number of valid bytes in `buf`. Subsequent decoding ends here.
    buflen: usize,
    /// Whether BOM detection has been performed yet or not. This starts out
    /// as `true` when the builder determined that no detection is needed.
    has_detected: bool,
    /// Whether the underlying reader has been exhausted or not. This is set
    /// by `fill` when a refill leaves the internal buffer empty.
    exhausted: bool,
}
382
383impl<R: io::Read, B: AsMut<[u8]>> io::Read for DecodeReaderBytes<R, B> {
384 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
385 self.detect()?;
386 if self.decoder.is_none() {
387 self.rdr.read(buf)
388 } else {
389 self.transcode(buf)
390 }
391 }
392}
393
394impl<R: io::Read> DecodeReaderBytes<R, Vec<u8>> {
395 /// Create a new transcoder that converts a source stream to valid UTF-8
396 /// via BOM sniffing.
397 ///
398 /// To explicitly control the encoding, UTF-8 passthru or amortize
399 /// allocation, use the
400 /// [`DecodeReaderBytesBuilder`](struct.DecodeReaderBytesBuilder.html)
401 /// constructor.
402 ///
403 /// When a BOM is found (which must correspond to UTF-8, UTF-16LE or
404 /// UTF-16BE), then transcoding to UTF-8 is performed and any invalid
405 /// sequences in the source data are seamlessly replaced by the Unicode
406 /// replacement character.
407 ///
408 /// When no BOM is found (and no other encoding is specified via the
409 /// builder), the underlying bytes are passed through as-is.
410 pub fn new(rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
411 DecodeReaderBytesBuilder::new().build(rdr)
412 }
413}
414
415impl<R: io::Read, B: AsMut<[u8]>> DecodeReaderBytes<R, B> {
416 /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there
417 /// is a decoder capable of transcoding the inner stream to UTF-8. This
418 /// returns the number of bytes written to `buf`.
419 ///
420 /// When this function returns, exactly one of the following things will
421 /// be true:
422 ///
423 /// 1. A non-zero number of bytes were written to `buf`.
424 /// 2. The underlying reader reached EOF (or `buf` is empty).
425 /// 3. An error is returned: the internal buffer ran out of room.
426 /// 4. An I/O error occurred.
427 fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
428 if self.exhausted || buf.is_empty() {
429 return Ok(0);
430 }
431 let nwrite = self.tiny.read(buf)?;
432 if nwrite > 0 {
433 // We could technically mush on if the caller provided buffer is
434 // big enough, but to keep things we simple, we satisfy the
435 // contract and quit.
436 return Ok(nwrite);
437 }
438 if self.pos >= self.buflen {
439 self.fill()?;
440 }
441 if buf.len() < 4 {
442 return self.tiny_transcode(buf);
443 }
444 loop {
445 let (_, nin, nout, _) =
446 self.decoder.as_mut().unwrap().decode_to_utf8(
447 &self.buf.as_mut()[self.pos..self.buflen],
448 buf,
449 false,
450 );
451 self.pos += nin;
452 // If we've written at least one byte to the caller-provided
453 // buffer, then our mission is complete.
454 if nout > 0 {
455 return Ok(nout);
456 }
457 // Otherwise, we know that our internal buffer has insufficient
458 // data to transcode at least one char, so we attempt to refill it.
459 self.fill()?;
460 // ... but quit on EOF.
461 if self.buflen == 0 {
462 let (_, _, nout, _) = self
463 .decoder
464 .as_mut()
465 .unwrap()
466 .decode_to_utf8(&[], buf, true);
467 return Ok(nout);
468 }
469 }
470 }
471
472 /// Like transcode, but deals with the case where the caller provided
473 /// buffer is less than 4.
474 fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
475 assert!(buf.len() < 4, "have a small caller buffer");
476 loop {
477 let (nin, nout) = self.tiny.transcode(
478 self.decoder.as_mut().unwrap(),
479 &self.buf.as_mut()[self.pos..self.buflen],
480 false,
481 );
482 self.pos += nin;
483 if nout > 0 {
484 // We've satisfied the contract of writing at least one byte,
485 // so we're done. The tiny transcoder is guaranteed to yield
486 // a non-zero number of bytes.
487 return self.tiny.read(buf);
488 }
489 // Otherwise, we know that our internal buffer has insufficient
490 // data to transcode at least one char, so we attempt to refill it.
491 self.fill()?;
492 // ... but quit on EOF.
493 if self.buflen == 0 {
494 self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true);
495 return self.tiny.read(buf);
496 }
497 }
498 }
499
500 /// Peeks at the underlying reader to look for a BOM. If one exists, then
501 /// an appropriate decoder is created corresponding to the detected BOM.
502 fn detect(&mut self) -> io::Result<()> {
503 if self.has_detected {
504 return Ok(());
505 }
506 self.has_detected = true;
507 let bom = self.rdr.peek_bom()?;
508 if let Some(encoding) = bom.encoding() {
509 // If we got a UTF-8 BOM, and the decoder was configured for
510 // passing through UTF-8, then don't build a decoder at all.
511 if encoding == UTF_8 && self.utf8_passthru {
512 return Ok(());
513 }
514 self.decoder = Some(encoding.new_decoder_with_bom_removal());
515 }
516 Ok(())
517 }
518
519 /// Fill the internal buffer from the underlying reader.
520 ///
521 /// If there are unread bytes in the internal buffer, then we move them
522 /// to the beginning of the internal buffer and fill the remainder.
523 ///
524 /// If the internal buffer is too small to read additional bytes, then an
525 /// error is returned.
526 fn fill(&mut self) -> io::Result<()> {
527 if self.pos < self.buflen {
528 // Despite my best efforts, I could not seem to actually exercise
529 // this code path in tests. Namely, this code path occurs when the
530 // decoder can't make any progress and also doesn't consume all of
531 // the input. Since I'm not sure how to trigger that case, this
532 // code path is actually untested!
533
534 // We can assert this because we require that the caller provided
535 // buffer be at least 4 bytes big.
536 assert!(
537 self.buflen < self.buf.as_mut().len(),
538 "internal buffer should never be exhausted"
539 );
540 let buf = self.buf.as_mut();
541 for (dst, src) in (self.pos..self.buflen).enumerate() {
542 buf[dst] = buf[src];
543 }
544 self.buflen -= self.pos;
545 } else {
546 self.buflen = 0;
547 }
548 self.pos = 0;
549 self.buflen += self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
550 if self.buflen == 0 {
551 self.exhausted = true;
552 }
553 Ok(())
554 }
555}
556
557impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
558 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
559 let mut fmter = f.debug_struct("DecodeReaderBytes");
560 fmter
561 .field("rdr", &self.rdr)
562 .field("tiny", &self.tiny)
563 .field("utf8_passthru", &self.utf8_passthru)
564 .field("buf", &self.buf)
565 .field("pos", &self.pos)
566 .field("buflen", &self.buflen)
567 .field("has_detected", &self.has_detected)
568 .field("exhausted", &self.exhausted);
569 // Because `encoding_rs::Decoder` doesn't impl `fmt::Debug`.
570 if let Some(ref d) = self.decoder {
571 let msg = format!("Some(<Decoder for {}>)", d.encoding().name());
572 fmter.field("decoder", &msg);
573 } else {
574 fmter.field("decoder", &"None");
575 }
576 fmter.finish()
577 }
578}
579
#[cfg(test)]
mod tests {
    use std::io::Read;

    use encoding_rs::{self, Encoding};

    use super::{DecodeReaderBytes, DecodeReaderBytesBuilder};

    /// Drains `rdr` into a `String`, panicking if reading fails.
    fn read_to_string<R: Read>(mut rdr: R) -> String {
        let mut text = String::new();
        rdr.read_to_string(&mut text).unwrap();
        text
    }

    /// Collects the output of `rdr` through the byte oriented `Read::bytes`
    /// API, panicking if reading fails.
    fn read_bytes<R: Read>(rdr: R) -> Vec<u8> {
        rdr.bytes().map(|res| res.unwrap()).collect()
    }

    /// A UTF-16LE BOM followed by "a".
    fn utf16le_a() -> Vec<u8> {
        vec![0xFF, 0xFE, 0x61, 0x00]
    }

    /// A UTF-16BE BOM followed by "a".
    fn utf16be_a() -> Vec<u8> {
        vec![0xFE, 0xFF, 0x00, 0x61]
    }

    /// A UTF-16LE BOM followed by "abc".
    fn utf16le_abc() -> Vec<u8> {
        vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00]
    }

    /// A UTF-16LE BOM followed by "abcdefgh".
    fn utf16le_abcdefgh() -> Vec<u8> {
        #[rustfmt::skip]
        let bytes = vec![
            0xFF, 0xFE,
            0x61, 0x00,
            0x62, 0x00,
            0x63, 0x00,
            0x64, 0x00,
            0x65, 0x00,
            0x66, 0x00,
            0x67, 0x00,
            0x68, 0x00,
        ];
        bytes
    }

    /// Reads from `rdr` through a one byte caller buffer and checks that it
    /// yields exactly 'a', 'b', 'c' and then EOF.
    fn assert_abc_one_byte_at_a_time<R: Read>(mut rdr: R) {
        let mut tmp = [0u8; 1];
        for &expected in b"abc".iter() {
            let nread = rdr.read(&mut tmp).unwrap();
            assert_eq!(nread, 1);
            assert_eq!(tmp, [expected; 1]);
        }
        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 0);
    }

    // When the source consists solely of a UTF-16 BOM, the bytes are passed
    // through unchanged. A lone UTF-8 BOM yields no output at all, unless
    // UTF-8 passthru is enabled, in which case it is passed through too.
    #[test]
    fn trans_utf16_bom() {
        let mut dstbuf = vec![0; 8 * (1 << 10)];

        for srcbuf in vec![vec![0xFF, 0xFE], vec![0xFE, 0xFF]] {
            let mut rdr = DecodeReaderBytes::new(&*srcbuf);
            let n = rdr.read(&mut dstbuf).unwrap();
            assert_eq!(&*srcbuf, &dstbuf[..n]);
        }

        let utf8_bom = vec![0xEF, 0xBB, 0xBF];
        let mut rdr = DecodeReaderBytes::new(&*utf8_bom);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(n, 0);

        let mut rdr = DecodeReaderBytesBuilder::new()
            .utf8_passthru(true)
            .build(&*utf8_bom);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(&*utf8_bom, &dstbuf[..n]);
    }

    // Test basic UTF-16 decoding, in both byte orders.
    #[test]
    fn trans_utf16_basic() {
        for srcbuf in vec![utf16le_a(), utf16be_a()] {
            let mut rdr = DecodeReaderBytes::new(&*srcbuf);
            assert_eq!("a", read_to_string(&mut rdr));
        }
    }

    // Test basic UTF-16 decoding when BOM stripping is requested.
    #[test]
    fn trans_utf16_basic_without_bom() {
        for srcbuf in vec![utf16le_a(), utf16be_a()] {
            let mut rdr = DecodeReaderBytesBuilder::new()
                .strip_bom(true)
                .build(&*srcbuf);
            assert_eq!("a", read_to_string(&mut rdr));
        }
    }

    // Test the BOM override: the UTF-16LE BOM wins over the explicit UTF-8
    // encoding.
    #[test]
    fn trans_utf16_bom_override() {
        let srcbuf = utf16le_a();
        let mut rdr = DecodeReaderBytesBuilder::new()
            .bom_override(true)
            .encoding(Some(encoding_rs::UTF_8))
            .build(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));
    }

    // Test basic UTF-16 decoding with a small caller buffer.
    #[test]
    fn trans_utf16_smallbuf() {
        let srcbuf = utf16le_abc();
        let rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_abc_one_byte_at_a_time(rdr);
    }

    // Test incomplete UTF-16 decoding. This ensures we see a replacement char
    // if the stream ends with an unpaired code unit.
    #[test]
    fn trans_utf16_incomplete() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_eq!("a\u{FFFD}", read_to_string(&mut rdr));
    }

    // Test transcoding with a minimal buffer but a large caller buffer.
    #[test]
    fn trans_utf16_minimal_buffer_normal_caller_buffer() {
        let srcbuf = utf16le_abcdefgh();
        let mut rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        let got = read_to_string(&mut rdr);
        assert_eq!(got, "abcdefgh");
    }

    // Test transcoding with a minimal buffer and a minimal caller buffer.
    #[test]
    fn trans_utf16_minimal_buffers() {
        let srcbuf = utf16le_abc();
        let rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        assert_abc_one_byte_at_a_time(rdr);
    }

    // Test transcoding with using byte oriented APIs.
    #[test]
    fn trans_utf16_byte_api() {
        let srcbuf = utf16le_abcdefgh();
        let rdr = DecodeReaderBytes::new(&*srcbuf);
        let got = read_bytes(rdr);
        assert_eq!(got, b"abcdefgh");
    }

    // With sniffing disabled and no explicit encoding, everything (including
    // the BOM) is passed through untouched.
    #[test]
    fn trans_utf16_no_sniffing() {
        let srcbuf = utf16le_a();
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .build(&*srcbuf);
        let got = read_bytes(rdr);
        assert_eq!(got, srcbuf);
    }

    // With sniffing disabled, BOM stripping still removes the BOM while the
    // remaining bytes are passed through untouched.
    #[test]
    fn trans_utf16_no_sniffing_strip_bom() {
        let srcbuf = utf16le_a();
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .strip_bom(true)
            .build(&*srcbuf);
        let got = read_bytes(rdr);
        assert_eq!(got, &[0x61, 0x00]);
    }

    // With sniffing disabled, an explicit encoding is still honored.
    #[test]
    fn trans_utf16_no_sniffing_encoding_override() {
        let srcbuf = utf16le_a();
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .encoding(Some(encoding_rs::UTF_16LE))
            .build(&*srcbuf);
        let got = read_bytes(rdr);
        assert_eq!(got, b"a");
    }

    // With sniffing disabled, an explicit encoding combined with BOM
    // stripping yields the same transcoded output.
    #[test]
    fn trans_utf16_no_sniffing_encoding_override_strip_bom() {
        let srcbuf = utf16le_a();
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .strip_bom(true)
            .encoding(Some(encoding_rs::UTF_16LE))
            .build(&*srcbuf);
        let got = read_bytes(rdr);
        assert_eq!(got, b"a");
    }

    // Test transcoding with a minimal buffer using byte oriented APIs.
    #[test]
    fn trans_utf16_minimal_buffer_byte_api() {
        let srcbuf = utf16le_abcdefgh();
        let rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        let got = read_bytes(rdr);
        assert_eq!(got, b"abcdefgh");
    }

    // Test a buffer that is too small.
    #[test]
    fn buffer_too_small() {
        let res = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&[][..], vec![0; 3]);
        assert!(res.is_err());
    }

    // Generates a test that decodes `$srcbytes` using the encoding looked up
    // from the label `$enc` and compares the result against `$dst`.
    macro_rules! test_trans_simple {
        ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => {
            #[test]
            fn $name() {
                let source = &$srcbytes[..];
                let encoding = Encoding::for_label($enc.as_bytes());
                let mut rdr = DecodeReaderBytesBuilder::new()
                    .encoding(encoding)
                    .build(source);
                assert_eq!($dst, read_to_string(&mut rdr));
            }
        };
    }

    // This isn't exhaustive obviously, but it lets us test base level support.
    test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж");
    test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж");
    test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж");
    test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж");
    test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж");
    test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж");
    test_trans_simple!(
        trans_simple_big5_hkscs,
        "big5-hkscs",
        b"\xC7\xFA",
        "Ж"
    );
    test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж");
    test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж");
    test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж");
    test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©");
}