encoding_rs_io/
lib.rs

1/*!
2This crate provides streaming transcoding by implementing Rust's I/O traits
3and delegating transcoding to the
4[`encoding_rs`](https://crates.io/crates/encoding_rs)
5crate.
6
7Currently, this crate only provides a means of transcoding from a source
8encoding (that is among the encodings supported by `encoding_rs`) to UTF-8 via
9an implementation of `std::io::Read`, where errors are handled by replacing
10invalid sequences with the Unicode replacement character. Future work may
11provide additional implementations for `std::io::Write` and/or implementations
12that make stronger guarantees about UTF-8 validity.
13
14# Example
15
16This example shows how to create a decoder that transcodes UTF-16LE (the
17source) to UTF-8 (the destination).
18
19```
20extern crate encoding_rs;
21extern crate encoding_rs_io;
22
23use std::error::Error;
24use std::io::Read;
25
26use encoding_rs_io::DecodeReaderBytes;
27
28# fn main() { example().unwrap(); }
29fn example() -> Result<(), Box<Error>> {
30    let source_data = &b"\xFF\xFEf\x00o\x00o\x00b\x00a\x00r\x00"[..];
31    // N.B. `source_data` can be any arbitrary io::Read implementation.
32    let mut decoder = DecodeReaderBytes::new(source_data);
33
34    let mut dest = String::new();
35    // decoder implements the io::Read trait, so it can easily be plugged
36    // into any consumer expecting an arbitrary reader.
37    decoder.read_to_string(&mut dest)?;
38    assert_eq!(dest, "foobar");
39    Ok(())
40}
41```
42
43# Future work
44
45Currently, this crate only provides a way to get _possibly valid_ UTF-8 from
46some source encoding. There are other transformations that may be useful that
47we could include in this crate. Namely:
48
49* An encoder that accepts an arbitrary `std::io::Write` implementation and
50  takes valid UTF-8 and transcodes it to a selected destination encoding. This
51  encoder would implement `std::fmt::Write`.
52* A decoder that accepts an arbitrary `std::fmt::Write` implementation and
53  takes arbitrary bytes and transcodes them from a selected source
54  encoding to valid UTF-8. This decoder would implement `std::io::Write`.
55* An encoder that accepts an arbitrary `UnicodeRead` implementation and
56  takes valid UTF-8 and transcodes it to a selected destination encoding.
57  This encoder would implement `std::io::Read`.
58* A decoder that accepts an arbitrary `std::io::Read` implementation and
59  takes arbitrary bytes and transcodes them from a selected source encoding
60  to valid UTF-8. This decoder would implement the `UnicodeRead` trait.
61
62Where `UnicodeRead` is a hypothetical trait that does not yet exist. Its
63definition might look something like this:
64
65```ignore
66trait UnicodeRead {
67    fn read(&mut self, buf: &mut str) -> Result<usize>;
68}
69```
70
71Interestingly, of the above transformations, none of them correspond to
72`DecodeReaderBytes`. Namely, `DecodeReaderBytes` most closely corresponds to
73the last option, but instead of guaranteeing valid UTF-8 by implementing a
74trait like `UnicodeRead`, it instead implements `std::io::Read`, which pushes
75UTF-8 handling on to the caller. However, it turns out that this particular
use case is important for operations like search, which can often be written
in a way that doesn't assume UTF-8 validity but still benefits from it.
78
79It's not clear which of the above transformations is actually useful, but all
80of them could theoretically exist. There is more discussion on this topic
81here (and in particular, the above formulation was taken almost verbatim from
82Simon Sapin's comments): https://github.com/hsivonen/encoding_rs/issues/8
83
84It is also perhaps worth stating that this crate very much intends on
85remaining coupled to `encoding_rs`, which helps restrict the scope, but may be
86too biased toward Web oriented encoding to solve grander encoding challenges.
87As such, it may very well be that this crate is actually a stepping stone to
88something with a larger scope. But first, we must learn.
89*/
90
91extern crate encoding_rs;
92
93use std::fmt;
94use std::io::{self, Read};
95
96use encoding_rs::{Decoder, Encoding, UTF_8};
97
98use util::{BomPeeker, TinyTranscoder};
99
100mod util;
101
/// A builder for constructing a byte oriented transcoder to UTF-8.
///
/// Each option is configured through the builder method of the same name.
/// The defaults are established by `DecodeReaderBytesBuilder::new`.
#[derive(Clone, Debug)]
pub struct DecodeReaderBytesBuilder {
    /// The explicitly requested source encoding, if any.
    encoding: Option<&'static Encoding>,
    /// Whether a sniffed UTF-8 BOM leaves the stream untranscoded.
    utf8_passthru: bool,
    /// Whether BOM sniffing still runs when `encoding` is set, so that a
    /// detected BOM can replace the explicit encoding.
    bom_override: bool,
    /// Whether the reader is wrapped so that a leading BOM is not handed to
    /// callers.
    strip_bom: bool,
    /// Whether the start of the stream is inspected for a BOM at all.
    bom_sniffing: bool,
}
111
112impl Default for DecodeReaderBytesBuilder {
113    fn default() -> DecodeReaderBytesBuilder {
114        DecodeReaderBytesBuilder::new()
115    }
116}
117
118impl DecodeReaderBytesBuilder {
119    /// Create a new decoder builder with a default configuration.
120    ///
121    /// By default, no explicit encoding is used, but if a UTF-8 or UTF-16
122    /// BOM is detected, then an appropriate encoding is automatically
123    /// detected and transcoding is performed (where invalid sequences map to
124    /// the Unicode replacement codepoint).
125    pub fn new() -> DecodeReaderBytesBuilder {
126        DecodeReaderBytesBuilder {
127            encoding: None,
128            utf8_passthru: false,
129            bom_override: false,
130            strip_bom: false,
131            bom_sniffing: true,
132        }
133    }
134
135    /// Build a new decoder that wraps the given reader.
136    pub fn build<R: io::Read>(&self, rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
137        self.build_with_buffer(rdr, vec![0; 8 * (1 << 10)]).unwrap()
138    }
139
140    /// Build a new decoder that wraps the given reader and uses the given
141    /// buffer internally for transcoding.
142    ///
143    /// This is useful for cases where it is advantageuous to amortize
144    /// allocation. Namely, this method permits reusing a buffer for
145    /// subsequent decoders.
146    ///
147    /// This returns an error if the buffer is smaller than 4 bytes (which is
148    /// too small to hold maximum size of a single UTF-8 encoded codepoint).
149    pub fn build_with_buffer<R: io::Read, B: AsMut<[u8]>>(
150        &self,
151        rdr: R,
152        mut buffer: B,
153    ) -> io::Result<DecodeReaderBytes<R, B>> {
154        if buffer.as_mut().len() < 4 {
155            let msg = format!(
156                "DecodeReaderBytesBuilder: buffer of size {} is too small",
157                buffer.as_mut().len(),
158            );
159            return Err(io::Error::new(io::ErrorKind::Other, msg));
160        }
161        let encoding =
162            self.encoding.map(|enc| enc.new_decoder_with_bom_removal());
163
164        // No need to do BOM detection if we opt out of it or have an explicit
165        // encoding.
166        let has_detected =
167            !self.bom_sniffing || (!self.bom_override && encoding.is_some());
168
169        let peeker = if self.strip_bom {
170            BomPeeker::without_bom(rdr)
171        } else {
172            BomPeeker::with_bom(rdr)
173        };
174        Ok(DecodeReaderBytes {
175            rdr: peeker,
176            decoder: encoding,
177            tiny: TinyTranscoder::new(),
178            utf8_passthru: self.utf8_passthru,
179            buf: buffer,
180            buflen: 0,
181            pos: 0,
182            has_detected: has_detected,
183            exhausted: false,
184        })
185    }
186
    /// Set an explicit encoding to be used by this decoder.
    ///
    /// When an explicit encoding is set, BOM sniffing is skipped and the
    /// encoding provided is used, unless `bom_override` is enabled (and
    /// `bom_sniffing` has not been disabled), in which case the encoding
    /// indicated by a detected BOM takes precedence. Errors in the encoded
    /// bytes are replaced by the Unicode replacement codepoint.
    ///
    /// Passing `None` clears any previously set explicit encoding.
    ///
    /// By default, no explicit encoding is set.
    pub fn encoding(
        &mut self,
        encoding: Option<&'static Encoding>,
    ) -> &mut DecodeReaderBytesBuilder {
        self.encoding = encoding;
        self
    }
201
    /// Enable UTF-8 passthru, even when a UTF-8 BOM is observed.
    ///
    /// When an explicit encoding is not set (thereby invoking automatic
    /// encoding detection via BOM sniffing), then a UTF-8 BOM will cause
    /// UTF-8 transcoding to occur. In particular, if the source contains
    /// invalid UTF-8 sequences, then they are replaced with the Unicode
    /// replacement codepoint.
    ///
    /// This transcoding may not be desirable. For example, the caller may
    /// already have its own UTF-8 handling where invalid UTF-8 is
    /// appropriately handled, in which case, doing an extra transcoding
    /// step is extra and unnecessary work. Enabling this option will prevent
    /// that extra transcoding step from occurring. In this case, the bytes
    /// emitted by the reader are passed through unchanged (including the BOM)
    /// and the caller will be responsible for handling any invalid UTF-8.
    ///
    /// This option is only consulted while sniffing the BOM. When sniffing
    /// is skipped (because `bom_sniffing` is disabled, or because an explicit
    /// encoding is set without `bom_override`), it has no effect.
    ///
    /// This is disabled by default.
    ///
    /// # Example
    ///
    /// This example demonstrates the effect of enabling this option on data
    /// that includes a UTF-8 BOM but also, interestingly enough, subsequently
    /// includes invalid UTF-8.
    ///
    /// ```
    /// extern crate encoding_rs;
    /// extern crate encoding_rs_io;
    ///
    /// use std::error::Error;
    /// use std::io::Read;
    ///
    /// use encoding_rs_io::DecodeReaderBytesBuilder;
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> Result<(), Box<Error>> {
    ///     let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..];
    ///     let mut decoder = DecodeReaderBytesBuilder::new()
    ///         .utf8_passthru(true)
    ///         .build(source_data);
    ///
    ///     let mut dest = vec![];
    ///     decoder.read_to_end(&mut dest)?;
    ///     // Without the passthru option, you'd get "foo\u{FFFD}bar".
    ///     assert_eq!(dest, b"\xEF\xBB\xBFfoo\xFFbar");
    ///     Ok(())
    /// }
    /// ```
    pub fn utf8_passthru(
        &mut self,
        yes: bool,
    ) -> &mut DecodeReaderBytesBuilder {
        self.utf8_passthru = yes;
        self
    }
254
    /// Whether or not to always strip a BOM if one is found.
    ///
    /// When this is enabled, if a BOM is found at the beginning of a stream,
    /// then it is ignored. This applies even when `utf8_passthru` is enabled
    /// or if `bom_sniffing` is disabled.
    ///
    /// This is disabled by default.
    ///
    /// # Example
    ///
    /// This example shows how to remove the BOM if it's present even when
    /// `utf8_passthru` is enabled.
    ///
    /// ```
    /// extern crate encoding_rs;
    /// extern crate encoding_rs_io;
    ///
    /// use std::error::Error;
    /// use std::io::Read;
    ///
    /// use encoding_rs_io::DecodeReaderBytesBuilder;
    ///
    /// # fn main() { example().unwrap(); }
    /// fn example() -> Result<(), Box<Error>> {
    ///     let source_data = &b"\xEF\xBB\xBFfoo\xFFbar"[..];
    ///     let mut decoder = DecodeReaderBytesBuilder::new()
    ///         .utf8_passthru(true)
    ///         .strip_bom(true)
    ///         .build(source_data);
    ///
    ///     let mut dest = vec![];
    ///     decoder.read_to_end(&mut dest)?;
    ///     // If `strip_bom` wasn't enabled, then this would include the BOM.
    ///     assert_eq!(dest, b"foo\xFFbar");
    ///     Ok(())
    /// }
    /// ```
    pub fn strip_bom(&mut self, yes: bool) -> &mut DecodeReaderBytesBuilder {
        // Consulted at build time to choose between `BomPeeker::without_bom`
        // and `BomPeeker::with_bom` for wrapping the underlying reader.
        self.strip_bom = yes;
        self
    }
296
    /// Give the highest precedence to the BOM, if one is found.
    ///
    /// When this is enabled, and if a BOM is found, then the encoding
    /// indicated by that BOM is used even if an explicit encoding has been
    /// set via the `encoding` method.
    ///
    /// This does not override `utf8_passthru`, and it has no effect when
    /// `bom_sniffing` is disabled (no BOM detection is performed at all in
    /// that case).
    ///
    /// This is disabled by default.
    pub fn bom_override(
        &mut self,
        yes: bool,
    ) -> &mut DecodeReaderBytesBuilder {
        self.bom_override = yes;
        self
    }
313
    /// Enable or disable BOM sniffing.
    ///
    /// When this is enabled and an explicit encoding is not set, the decoder
    /// will try to detect the encoding from a BOM at the start of the stream.
    /// When an explicit encoding is set, sniffing only takes place if
    /// `bom_override` is enabled as well.
    ///
    /// When this is disabled and an explicit encoding is not set, the decoder
    /// will treat the input as raw bytes. The bytes will be passed through
    /// unchanged, including any BOM that may be present.
    ///
    /// This is enabled by default.
    pub fn bom_sniffing(
        &mut self,
        yes: bool,
    ) -> &mut DecodeReaderBytesBuilder {
        self.bom_sniffing = yes;
        self
    }
331}
332
/// An implementation of `io::Read` that transcodes to UTF-8 in a streaming
/// fashion.
///
/// The high level goal of this decoder is to provide access to byte streams
/// that are assumed to be UTF-8 unless an encoding is otherwise specified
/// (either via a BOM or via an explicit designation of an encoding).
///
/// When no explicit source encoding is specified (via
/// `DecodeReaderBytesBuilder`), the source encoding is determined by
/// inspecting the BOM from the stream read from `R`, if one exists. If a
/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
/// invalid UTF-16 sequences translated to the Unicode replacement character.
/// Similarly if a UTF-8 BOM is seen. In all other cases, the source of the
/// underlying reader is passed through unchanged _as if_ it were UTF-8.
///
/// Since this particular reader does not guarantee providing valid UTF-8 to
/// the caller, the caller must be prepared to handle invalid UTF-8 itself.
///
/// `R` is the type of the underlying reader and `B` is the type of an internal
/// buffer used to store the results of transcoding. Callers may elect to reuse
/// the internal buffer via the `DecodeReaderBytesBuilder::build_with_buffer`
/// constructor.
///
/// `fmt::Debug` is implemented by hand for this type because
/// `encoding_rs::Decoder` does not implement it.
pub struct DecodeReaderBytes<R, B> {
    /// The underlying reader, wrapped in a peeker for reading a BOM if one
    /// exists.
    rdr: BomPeeker<R>,
    /// The underlying text decoder derived from the BOM or an explicitly
    /// specified encoding, if one exists. When this is `None`, reads are
    /// forwarded to `rdr` without any transcoding.
    decoder: Option<Decoder>,
    /// A "tiny transcoder" for use when a caller provides a buffer that is
    /// too small to write at least one UTF-8 encoded codepoint to.
    tiny: TinyTranscoder,
    /// When enabled, if a UTF-8 BOM is observed, then the bytes are passed
    /// through from the underlying reader as-is instead of passing through
    /// the UTF-8 transcoder (which will replace invalid sequences with the
    /// REPLACEMENT CHARACTER).
    utf8_passthru: bool,
    /// The internal buffer holding bytes read from `rdr` that have not yet
    /// been fed to the decoder.
    buf: B,
    /// The current position in `buf`. Subsequent reads start here. The code
    /// slices `buf[pos..buflen]`, so `pos <= buflen` is expected to hold.
    pos: usize,
    /// The number of valid bytes in `buf`. Subsequent reads end here.
    buflen: usize,
    /// Whether BOM detection has been performed yet or not.
    has_detected: bool,
    /// Whether the underlying reader has been exhausted or not.
    exhausted: bool,
}
382
383impl<R: io::Read, B: AsMut<[u8]>> io::Read for DecodeReaderBytes<R, B> {
384    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
385        self.detect()?;
386        if self.decoder.is_none() {
387            self.rdr.read(buf)
388        } else {
389            self.transcode(buf)
390        }
391    }
392}
393
394impl<R: io::Read> DecodeReaderBytes<R, Vec<u8>> {
395    /// Create a new transcoder that converts a source stream to valid UTF-8
396    /// via BOM sniffing.
397    ///
398    /// To explicitly control the encoding, UTF-8 passthru or amortize
399    /// allocation, use the
400    /// [`DecodeReaderBytesBuilder`](struct.DecodeReaderBytesBuilder.html)
401    /// constructor.
402    ///
403    /// When a BOM is found (which must correspond to UTF-8, UTF-16LE or
404    /// UTF-16BE), then transcoding to UTF-8 is performed and any invalid
405    /// sequences in the source data are seamlessly replaced by the Unicode
406    /// replacement character.
407    ///
408    /// When no BOM is found (and no other encoding is specified via the
409    /// builder), the underlying bytes are passed through as-is.
410    pub fn new(rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
411        DecodeReaderBytesBuilder::new().build(rdr)
412    }
413}
414
415impl<R: io::Read, B: AsMut<[u8]>> DecodeReaderBytes<R, B> {
416    /// Transcode the inner stream to UTF-8 in `buf`. This assumes that there
417    /// is a decoder capable of transcoding the inner stream to UTF-8. This
418    /// returns the number of bytes written to `buf`.
419    ///
420    /// When this function returns, exactly one of the following things will
421    /// be true:
422    ///
423    /// 1. A non-zero number of bytes were written to `buf`.
424    /// 2. The underlying reader reached EOF (or `buf` is empty).
425    /// 3. An error is returned: the internal buffer ran out of room.
426    /// 4. An I/O error occurred.
427    fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
428        if self.exhausted || buf.is_empty() {
429            return Ok(0);
430        }
431        let nwrite = self.tiny.read(buf)?;
432        if nwrite > 0 {
433            // We could technically mush on if the caller provided buffer is
434            // big enough, but to keep things we simple, we satisfy the
435            // contract and quit.
436            return Ok(nwrite);
437        }
438        if self.pos >= self.buflen {
439            self.fill()?;
440        }
441        if buf.len() < 4 {
442            return self.tiny_transcode(buf);
443        }
444        loop {
445            let (_, nin, nout, _) =
446                self.decoder.as_mut().unwrap().decode_to_utf8(
447                    &self.buf.as_mut()[self.pos..self.buflen],
448                    buf,
449                    false,
450                );
451            self.pos += nin;
452            // If we've written at least one byte to the caller-provided
453            // buffer, then our mission is complete.
454            if nout > 0 {
455                return Ok(nout);
456            }
457            // Otherwise, we know that our internal buffer has insufficient
458            // data to transcode at least one char, so we attempt to refill it.
459            self.fill()?;
460            // ... but quit on EOF.
461            if self.buflen == 0 {
462                let (_, _, nout, _) = self
463                    .decoder
464                    .as_mut()
465                    .unwrap()
466                    .decode_to_utf8(&[], buf, true);
467                return Ok(nout);
468            }
469        }
470    }
471
472    /// Like transcode, but deals with the case where the caller provided
473    /// buffer is less than 4.
474    fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
475        assert!(buf.len() < 4, "have a small caller buffer");
476        loop {
477            let (nin, nout) = self.tiny.transcode(
478                self.decoder.as_mut().unwrap(),
479                &self.buf.as_mut()[self.pos..self.buflen],
480                false,
481            );
482            self.pos += nin;
483            if nout > 0 {
484                // We've satisfied the contract of writing at least one byte,
485                // so we're done. The tiny transcoder is guaranteed to yield
486                // a non-zero number of bytes.
487                return self.tiny.read(buf);
488            }
489            // Otherwise, we know that our internal buffer has insufficient
490            // data to transcode at least one char, so we attempt to refill it.
491            self.fill()?;
492            // ... but quit on EOF.
493            if self.buflen == 0 {
494                self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true);
495                return self.tiny.read(buf);
496            }
497        }
498    }
499
500    /// Peeks at the underlying reader to look for a BOM. If one exists, then
501    /// an appropriate decoder is created corresponding to the detected BOM.
502    fn detect(&mut self) -> io::Result<()> {
503        if self.has_detected {
504            return Ok(());
505        }
506        self.has_detected = true;
507        let bom = self.rdr.peek_bom()?;
508        if let Some(encoding) = bom.encoding() {
509            // If we got a UTF-8 BOM, and the decoder was configured for
510            // passing through UTF-8, then don't build a decoder at all.
511            if encoding == UTF_8 && self.utf8_passthru {
512                return Ok(());
513            }
514            self.decoder = Some(encoding.new_decoder_with_bom_removal());
515        }
516        Ok(())
517    }
518
519    /// Fill the internal buffer from the underlying reader.
520    ///
521    /// If there are unread bytes in the internal buffer, then we move them
522    /// to the beginning of the internal buffer and fill the remainder.
523    ///
524    /// If the internal buffer is too small to read additional bytes, then an
525    /// error is returned.
526    fn fill(&mut self) -> io::Result<()> {
527        if self.pos < self.buflen {
528            // Despite my best efforts, I could not seem to actually exercise
529            // this code path in tests. Namely, this code path occurs when the
530            // decoder can't make any progress and also doesn't consume all of
531            // the input. Since I'm not sure how to trigger that case, this
532            // code path is actually untested!
533
534            // We can assert this because we require that the caller provided
535            // buffer be at least 4 bytes big.
536            assert!(
537                self.buflen < self.buf.as_mut().len(),
538                "internal buffer should never be exhausted"
539            );
540            let buf = self.buf.as_mut();
541            for (dst, src) in (self.pos..self.buflen).enumerate() {
542                buf[dst] = buf[src];
543            }
544            self.buflen -= self.pos;
545        } else {
546            self.buflen = 0;
547        }
548        self.pos = 0;
549        self.buflen += self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
550        if self.buflen == 0 {
551            self.exhausted = true;
552        }
553        Ok(())
554    }
555}
556
557impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
558    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
559        let mut fmter = f.debug_struct("DecodeReaderBytes");
560        fmter
561            .field("rdr", &self.rdr)
562            .field("tiny", &self.tiny)
563            .field("utf8_passthru", &self.utf8_passthru)
564            .field("buf", &self.buf)
565            .field("pos", &self.pos)
566            .field("buflen", &self.buflen)
567            .field("has_detected", &self.has_detected)
568            .field("exhausted", &self.exhausted);
569        // Because `encoding_rs::Decoder` doesn't impl `fmt::Debug`.
570        if let Some(ref d) = self.decoder {
571            let msg = format!("Some(<Decoder for {}>)", d.encoding().name());
572            fmter.field("decoder", &msg);
573        } else {
574            fmter.field("decoder", &"None");
575        }
576        fmter.finish()
577    }
578}
579
#[cfg(test)]
mod tests {
    use std::io::Read;

    use encoding_rs::{self, Encoding};

    use super::{DecodeReaderBytes, DecodeReaderBytesBuilder};

    // Drain the given reader into a `String`, panicking if reading fails
    // (which includes the case where the output is not valid UTF-8).
    fn read_to_string<R: Read>(mut rdr: R) -> String {
        let mut s = String::new();
        rdr.read_to_string(&mut s).unwrap();
        s
    }

    // Test streams that consist of nothing but a BOM. A lone two byte UTF-16
    // BOM (either endianness) is passed through unchanged. A lone UTF-8 BOM
    // yields no output at all, unless UTF-8 passthru is enabled, in which
    // case it is passed through unchanged too.
    #[test]
    fn trans_utf16_bom() {
        let srcbuf = vec![0xFF, 0xFE];
        let mut dstbuf = vec![0; 8 * (1 << 10)];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(&*srcbuf, &dstbuf[..n]);

        let srcbuf = vec![0xFE, 0xFF];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(&*srcbuf, &dstbuf[..n]);

        let srcbuf = vec![0xEF, 0xBB, 0xBF];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(n, 0);

        let srcbuf = vec![0xEF, 0xBB, 0xBF];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .utf8_passthru(true)
            .build(&*srcbuf);
        let n = rdr.read(&mut dstbuf).unwrap();
        assert_eq!(&*srcbuf, &dstbuf[..n]);
    }

    // Test basic UTF-16 decoding, for both the little endian and big endian
    // BOMs.
    #[test]
    fn trans_utf16_basic() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));

        let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));
    }

    // Test that UTF-16 decoding gives the same result when `strip_bom` is
    // enabled.
    #[test]
    fn trans_utf16_basic_without_bom() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
        let mut rdr =
            DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));

        let srcbuf = vec![0xFE, 0xFF, 0x00, 0x61];
        let mut rdr =
            DecodeReaderBytesBuilder::new().strip_bom(true).build(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));
    }

    // Test the BOM override: the UTF-16LE BOM wins over the explicitly
    // requested UTF-8 encoding.
    #[test]
    fn trans_utf16_bom_override() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .bom_override(true)
            .encoding(Some(encoding_rs::UTF_8))
            .build(&*srcbuf);
        assert_eq!("a", read_to_string(&mut rdr));
    }

    // Test basic UTF-16 decoding with a small caller buffer (one byte per
    // read), ending with a zero length read at EOF.
    #[test]
    fn trans_utf16_smallbuf() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        let mut tmp = [0u8; 1];

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'a'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'b'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'c'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 0);
    }

    // Test incomplete UTF-16 decoding. This ensures we see a replacement char
    // if the stream ends with an unpaired code unit.
    #[test]
    fn trans_utf16_incomplete() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x00];
        let mut rdr = DecodeReaderBytes::new(&*srcbuf);
        assert_eq!("a\u{FFFD}", read_to_string(&mut rdr));
    }

    // Test transcoding with a minimal (4 byte) internal buffer but a large
    // caller buffer.
    #[test]
    fn trans_utf16_minimal_buffer_normal_caller_buffer() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
            0x62, 0x00,
            0x63, 0x00,
            0x64, 0x00,
            0x65, 0x00,
            0x66, 0x00,
            0x67, 0x00,
            0x68, 0x00,
        ];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        let got = read_to_string(&mut rdr);
        assert_eq!(got, "abcdefgh");
    }

    // Test transcoding with a minimal (4 byte) internal buffer and a minimal
    // (1 byte) caller buffer.
    #[test]
    fn trans_utf16_minimal_buffers() {
        let srcbuf = vec![0xFF, 0xFE, 0x61, 0x00, 0x62, 0x00, 0x63, 0x00];
        let mut rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        let mut tmp = [0u8; 1];

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'a'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'b'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 1);
        assert_eq!(tmp, [b'c'; 1]);

        let nread = rdr.read(&mut tmp).unwrap();
        assert_eq!(nread, 0);
    }

    // Test transcoding using the byte oriented `Read::bytes` API.
    #[test]
    fn trans_utf16_byte_api() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
            0x62, 0x00,
            0x63, 0x00,
            0x64, 0x00,
            0x65, 0x00,
            0x66, 0x00,
            0x67, 0x00,
            0x68, 0x00,
        ];
        let rdr = DecodeReaderBytes::new(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, b"abcdefgh");
    }

    // With BOM sniffing disabled and no explicit encoding, the source bytes
    // (BOM included) come out unchanged.
    #[test]
    fn trans_utf16_no_sniffing() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, srcbuf);
    }

    // With BOM sniffing disabled but `strip_bom` enabled, the BOM is removed
    // and the remaining bytes come out untranscoded.
    #[test]
    fn trans_utf16_no_sniffing_strip_bom() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .strip_bom(true)
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, &[0x61, 0x00]);
    }

    // With BOM sniffing disabled, an explicit encoding is still honored.
    #[test]
    fn trans_utf16_no_sniffing_encoding_override() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .encoding(Some(encoding_rs::UTF_16LE))
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, b"a");
    }

    // Same as above, but with `strip_bom` enabled as well. The result is
    // expected to be identical.
    #[test]
    fn trans_utf16_no_sniffing_encoding_override_strip_bom() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .bom_sniffing(false)
            .strip_bom(true)
            .encoding(Some(encoding_rs::UTF_16LE))
            .build(&*srcbuf);
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, b"a");
    }

    // Test transcoding with a minimal buffer using byte oriented APIs.
    #[test]
    fn trans_utf16_minimal_buffer_byte_api() {
        #[rustfmt::skip]
        let srcbuf = vec![
            0xFF, 0xFE,
            0x61, 0x00,
            0x62, 0x00,
            0x63, 0x00,
            0x64, 0x00,
            0x65, 0x00,
            0x66, 0x00,
            0x67, 0x00,
            0x68, 0x00,
        ];
        let rdr = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&*srcbuf, vec![0; 4])
            .unwrap();
        let got: Vec<u8> = rdr.bytes().map(|res| res.unwrap()).collect();
        assert_eq!(got, b"abcdefgh");
    }

    // Test that an internal buffer that is too small (fewer than 4 bytes) is
    // rejected with an error.
    #[test]
    fn buffer_too_small() {
        let res = DecodeReaderBytesBuilder::new()
            .build_with_buffer(&[][..], vec![0; 3]);
        assert!(res.is_err());
    }

    // Define a test named `$name` that decodes `$srcbytes` using the
    // encoding looked up from the label `$enc` and checks that the result is
    // the string `$dst`.
    macro_rules! test_trans_simple {
        ($name:ident, $enc:expr, $srcbytes:expr, $dst:expr) => {
            #[test]
            fn $name() {
                let srcbuf = &$srcbytes[..];
                let enc = Encoding::for_label($enc.as_bytes());
                let mut rdr = DecodeReaderBytesBuilder::new()
                    .encoding(enc)
                    .build(&*srcbuf);
                assert_eq!($dst, read_to_string(&mut rdr));
            }
        };
    }

    // This isn't exhaustive obviously, but it lets us test base level support.
    //
    // NOTE(review): the "does not exist" label presumably resolves to no
    // encoding at all, in which case the (already UTF-8) source bytes are
    // passed through unchanged.
    test_trans_simple!(trans_simple_auto, "does not exist", b"\xD0\x96", "Ж");
    test_trans_simple!(trans_simple_utf8, "utf-8", b"\xD0\x96", "Ж");
    test_trans_simple!(trans_simple_utf16le, "utf-16le", b"\x16\x04", "Ж");
    test_trans_simple!(trans_simple_utf16be, "utf-16be", b"\x04\x16", "Ж");
    test_trans_simple!(trans_simple_chinese, "chinese", b"\xA7\xA8", "Ж");
    test_trans_simple!(trans_simple_korean, "korean", b"\xAC\xA8", "Ж");
    test_trans_simple!(
        trans_simple_big5_hkscs,
        "big5-hkscs",
        b"\xC7\xFA",
        "Ж"
    );
    test_trans_simple!(trans_simple_gbk, "gbk", b"\xA7\xA8", "Ж");
    test_trans_simple!(trans_simple_sjis, "sjis", b"\x84\x47", "Ж");
    test_trans_simple!(trans_simple_eucjp, "euc-jp", b"\xA7\xA8", "Ж");
    test_trans_simple!(trans_simple_latin1, "latin1", b"\xA9", "©");
}
encoding_rs_io/lib.rs

encoding_rs_io/
lib.rs