1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
use std::io::{self, BufRead, Read};

use encoding_rs::{CoderResult, Decoder};

use crate::charsets::Charset;

/// A [`Read`] adapter that turns text in some charset into UTF-8 bytes.
///
/// Encoded bytes are pulled from the wrapped [`BufRead`] and run through a
/// charset decoder; what comes out of `read` is always UTF-8. Because of
/// that, `Read::read_to_string` can be used on a `TextReader` to collect the
/// whole stream into a `String`.
pub struct TextReader<R: BufRead> {
    /// Source of the still-encoded bytes.
    inner: R,
    /// Stateful decoder for the source charset.
    decoder: Decoder,
    /// Set once `inner` is exhausted and the decoder has emitted its final
    /// output; every later `read` returns `Ok(0)`.
    eof: bool,
}

impl<R> TextReader<R>
where
    R: BufRead,
{
    /// Create a new `TextReader` with the given charset.
    pub fn new(inner: R, charset: Charset) -> TextReader<R> {
        TextReader {
            inner: inner,
            decoder: charset.new_decoder(),
            eof: false,
        }
    }
}

impl<R> Read for TextReader<R>
where
    R: BufRead,
{
    /// Decode bytes from the inner reader into `buf` as UTF-8.
    ///
    /// The inner reader is polled with `fill_buf` repeatedly, and each chunk
    /// is decoded into the part of `buf` that has not been written yet, until
    /// either `buf` is full or the inner reader reaches EOF and the decoder
    /// has emitted its final output.
    ///
    /// Returns the number of UTF-8 bytes written to `buf`. Once the stream is
    /// finished, every subsequent call returns `Ok(0)`.
    ///
    /// # Errors
    ///
    /// Any error returned by the inner reader's `fill_buf` is propagated
    /// unchanged.
    ///
    /// NOTE(review): if `buf` is too small for the decoder to emit the next
    /// character, the decoder reports `OutputFull` without writing anything
    /// and this method returns `Ok(0)` even though the stream is not
    /// finished — callers should pass a buffer of at least a few bytes.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if self.eof {
            return Ok(0);
        }

        let mut total_written = 0;

        loop {
            let src = self.inner.fill_buf()?;
            // An empty slice from `fill_buf` means the inner reader is at EOF;
            // the decoder must then be told this is the last chunk so it can
            // flush any pending state.
            let last = src.is_empty();

            // Always decode into the *unwritten tail* of `buf`. Decoding into
            // `buf` itself would overwrite the output of earlier iterations
            // while `total_written` keeps growing.
            let (res, read, written, _) =
                self.decoder
                    .decode_to_utf8(src, &mut buf[total_written..], last);

            self.inner.consume(read);
            total_written += written;

            match res {
                CoderResult::InputEmpty if last => {
                    // the final flush succeeded, the stream is finished
                    self.eof = true;
                    break;
                }
                CoderResult::InputEmpty => {
                    // All available input was decoded. Fetch more unless the
                    // output is already full, in which case polling the inner
                    // reader again could block for no benefit.
                    if total_written == buf.len() {
                        break;
                    }
                }
                CoderResult::OutputFull => {
                    // `buf` has no room left; the remaining input (or the
                    // final flush) is handled by the next call to `read`
                    break;
                }
            }
        }

        Ok(total_written)
    }
}

/// UTF-8 input must pass through unchanged: 7 bytes in, 7 bytes out.
#[test]
fn test_stream_decoder_utf8() {
    let mut decoded = String::new();

    let byte_count = TextReader::new("québec".as_bytes(), crate::charsets::UTF_8)
        .read_to_string(&mut decoded)
        .ok();

    assert_eq!(byte_count, Some(7));
    assert_eq!(decoded, "québec");
}

/// A windows-1252 `0xC9` byte must come out as the two-byte UTF-8 `É`,
/// so 6 input bytes yield 7 output bytes.
#[test]
fn test_stream_decoder_latin1() {
    let mut decoded = String::new();

    let byte_count = TextReader::new(&b"qu\xC9bec"[..], crate::charsets::WINDOWS_1252)
        .read_to_string(&mut decoded)
        .ok();

    assert_eq!(byte_count, Some(7));
    assert_eq!(decoded, "quÉbec");
}

/// Decoding 10 000 windows-1252 `0xC9` bytes must produce nothing but `É`,
/// even though the output is larger than the input.
#[test]
fn test_string_reader_large_buffer_latin1() {
    let encoded = vec![201u8; 10_000];
    let mut decoded = String::new();

    TextReader::new(&encoded[..], crate::charsets::WINDOWS_1252)
        .read_to_string(&mut decoded)
        .unwrap();

    decoded.chars().for_each(|ch| assert_eq!(ch, 'É'));
}