use std::borrow::Cow;
use std::io;
use log::trace;
use encoding_rs as enc;
use enc::DecoderResult;
use tendril::{Tendril, TendrilSink, Atomicity, NonAtomic};
use tendril::fmt as form;
use tendril::stream::Utf8LossyDecoder;
mod encoding_hint;
pub use encoding_hint::{
EncodingHint, SharedEncodingHint,
};
use crate::READ_BUFFER_SIZE;
/// A byte-oriented `TendrilSink` that decodes its input from a given
/// encoding and forwards the resulting UTF-8 tendrils to an inner `Sink`.
///
/// The atomicity parameter `A` defaults to `NonAtomic`.
pub struct Decoder<Sink, A=NonAtomic>
where Sink: TendrilSink<form::UTF8, A>, A: Atomicity
{
// Decoding strategy chosen at construction time; it also owns the inner sink.
mode: Mode<Sink, A>,
}
/// Internal decoding strategy of a `Decoder`.
enum Mode<Sink, A>
where Sink: TendrilSink<form::UTF8, A>, A: Atomicity
{
// UTF-8 input: handled by tendril's `Utf8LossyDecoder`, which wraps the sink.
Utf8(Utf8LossyDecoder<Sink, A>),
// Any other encoding: an `encoding_rs` decoder paired with the sink that
// receives its output.
Other(enc::Decoder, Sink),
}
impl<Sink, A> Decoder<Sink, A>
    where Sink: TendrilSink<form::UTF8, A>, A: Atomicity
{
    /// Create a decoder for `encoding` that forwards decoded text to `sink`.
    ///
    /// UTF-8 is handled by tendril's `Utf8LossyDecoder`; every other
    /// encoding gets a decoder obtained from `encoding.new_decoder()`.
    pub fn new(encoding: &'static enc::Encoding, sink: Sink) -> Self {
        if encoding == enc::UTF_8 {
            return Decoder { mode: Mode::Utf8(Utf8LossyDecoder::new(sink)) };
        }
        Decoder { mode: Mode::Other(encoding.new_decoder(), sink) }
    }

    /// Borrow the sink that receives the decoded UTF-8 output.
    pub fn inner_sink(&self) -> &Sink {
        match &self.mode {
            Mode::Utf8(lossy) => &lossy.inner_sink,
            Mode::Other(_, sink) => sink,
        }
    }

    /// Read `r` until end of input, decoding every chunk, then finish the
    /// sink and return its output.
    ///
    /// Reads that fail with `io::ErrorKind::Interrupted` are retried; any
    /// other I/O error is returned to the caller as-is.
    pub fn read_to_end<R>(mut self, r: &mut R)
        -> Result<Sink::Output, io::Error>
        where Self: Sized, R: io::Read
    {
        loop {
            // A fresh buffer per chunk: ownership of it is handed to
            // `process` below.
            let mut chunk = Tendril::<form::Bytes, A>::new();
            // The buffer's bytes are uninitialized until `read` fills them;
            // only the prefix actually read is kept afterwards.
            // NOTE(review): std documents that handing an uninitialized
            // buffer to `Read::read` is the caller's responsibility --
            // confirm this is acceptable for every reader used here.
            unsafe {
                chunk.push_uninitialized(READ_BUFFER_SIZE);
            }

            // Retry interrupted reads until a byte count is obtained.
            let count = loop {
                match r.read(&mut chunk) {
                    Ok(count) => break count,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            };

            // A zero-length read signals end of input.
            if count == 0 {
                return Ok(self.finish());
            }

            // Drop the unread tail of the buffer.
            chunk.pop_back(READ_BUFFER_SIZE - count as u32);
            self.process(chunk);
        }
    }
}
impl<Sink, A> TendrilSink<form::Bytes, A> for Decoder<Sink, A>
where Sink: TendrilSink<form::UTF8, A>, A: Atomicity
{
type Output = Sink::Output;
fn process(&mut self, t: Tendril<form::Bytes, A>) {
match self.mode {
Mode::Utf8(ref mut utf8) => utf8.process(t),
Mode::Other(ref mut decoder, ref mut sink) => {
if t.is_empty() {
return;
}
decode_to_sink(t, decoder, sink, false);
},
}
}
fn error(&mut self, desc: Cow<'static, str>) {
match self.mode {
Mode::Utf8(ref mut utf8) => utf8.error(desc),
Mode::Other(_, ref mut sink) => sink.error(desc),
}
}
fn finish(self) -> Sink::Output {
match self.mode {
Mode::Utf8(utf8) => utf8.finish(),
Mode::Other(mut decoder, mut sink) => {
decode_to_sink(Tendril::new(), &mut decoder, &mut sink, true);
sink.finish()
}
}
}
}
/// Decode `inpt` with `decoder` and forward the resulting UTF-8 to `sink`.
///
/// The input is consumed in a loop. Each iteration decodes into a fresh
/// output buffer of at most `READ_BUFFER_SIZE` bytes and passes whatever was
/// written on to `sink.process`. A malformed sequence is reported through
/// `sink.error` and replaced in the output by U+FFFD.
///
/// * `inpt` - the bytes to decode; may be empty.
/// * `decoder` - the `encoding_rs` decoder, which carries state across calls.
/// * `sink` - receiver of decoded text and error reports.
/// * `last` - forwarded to the decoder; `true` marks the end of the stream.
fn decode_to_sink<Sink, A>(
    mut inpt: Tendril<form::Bytes, A>,
    decoder: &mut enc::Decoder,
    sink: &mut Sink,
    last: bool)
    where Sink: TendrilSink<form::UTF8, A>, A: Atomicity
{
    loop {
        let mut outt = <Tendril<form::Bytes, A>>::new();
        let len = decoder
            .max_utf8_buffer_length(inpt.len())
            .unwrap_or(READ_BUFFER_SIZE as usize);
        // Clamp while still in `usize` and only then narrow to `u32`.
        // Narrowing first would wrap for estimates above `u32::MAX` and
        // could leave an output buffer too small to make any progress.
        let len = std::cmp::min(len, READ_BUFFER_SIZE as usize) as u32;
        trace!("decode buffer len {}", len);
        // The buffer's contents are uninitialized here; only the prefix the
        // decoder reports as written is read below.
        unsafe { outt.push_uninitialized(len); }

        let (result, bytes_read, bytes_written) =
            decoder.decode_to_utf8_without_replacement(&inpt, &mut outt, last);

        if bytes_written > 0 {
            // The first `bytes_written` bytes were produced by the decoder's
            // UTF-8 output method, so they are reinterpreted as UTF-8
            // without a second validation pass.
            sink.process(unsafe {
                outt.subtendril(0, bytes_written as u32)
                    .reinterpret_without_validating()
            });
        }

        match result {
            DecoderResult::InputEmpty => break,
            DecoderResult::OutputFull => {
                // More input remains; go around with a fresh output buffer.
                trace!("decode OutputFull");
            },
            DecoderResult::Malformed(_, _) => {
                sink.error(Cow::Borrowed("invalid byte sequence"));
                sink.process("\u{FFFD}".into());
            },
        }

        // Discard what the decoder consumed and stop once nothing is left.
        inpt.pop_front(bytes_read as u32);
        if inpt.is_empty() {
            break;
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use tendril::SliceExt;

    /// Test sink that records every tendril and every error it receives.
    struct Accumulate<A>
        where A: Atomicity
    {
        tendrils: Vec<Tendril<form::UTF8, A>>,
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
        where A: Atomicity
    {
        /// Create a sink with nothing recorded yet.
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: Vec::new(),
                errors: Vec::new(),
            }
        }
    }

    impl<A> TendrilSink<form::UTF8, A> for Accumulate<A>
        where A: Atomicity
    {
        type Output = (Vec<Tendril<form::UTF8, A>>, Vec<String>);

        fn process(&mut self, t: Tendril<form::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            let message = desc.into_owned();
            self.errors.push(message);
        }

        fn finish(self) -> Self::Output {
            let Accumulate { tendrils, errors } = self;
            (tendrils, errors)
        }
    }

    /// Feed each chunk of `input` to `decoder`, finish it, and check both
    /// the concatenated output text and the number of reported errors.
    fn check_decode(
        mut decoder: Decoder<Accumulate<NonAtomic>>,
        input: &[&[u8]],
        expected: &str,
        errs: usize)
    {
        input
            .iter()
            .for_each(|chunk| decoder.process(chunk.to_tendril()));
        let (pieces, errors) = decoder.finish();

        // Join the recorded pieces so chunk boundaries do not matter.
        let mut joined: Tendril<form::UTF8> = Tendril::new();
        for piece in &pieces {
            joined.push_tendril(piece);
        }

        assert_eq!(expected, &*joined);
        assert_eq!(errs, errors.len());
    }

    /// Table of cases: (input chunks, expected text, expected error count).
    pub type Tests = &'static [(&'static [&'static [u8]], &'static str, usize)];

    // UTF-8 input, including multi-byte sequences split across chunks and
    // invalid or truncated sequences.
    const UTF_8: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"xyz"], "xyz", 0),
        (&[b"x", b"y", b"z"], "xyz", 0),
        (&[b"\xEA\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0),
        (&[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0),
        (&[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0),
        (&[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], "\u{a66e}", 0),
        (&[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0),
        (&[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
         "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", 4),
        (&[b"xy\xEA\x99", b"\xFFz"], "xy\u{fffd}\u{fffd}z", 2),
        (&[b"\xC0"], "\u{fffd}", 1),
        (&[b"\xEA\x99"], "\u{fffd}", 1),
    ];

    #[test]
    fn decode_utf8_encoding_rs() {
        for &(chunks, want, err_count) in UTF_8.iter() {
            let sink = Accumulate::new();
            check_decode(Decoder::new(enc::UTF_8, sink), chunks, want, err_count);
        }
    }

    // KOI8-U input, split across chunks in various ways.
    const KOI8_U: Tests = &[
        (&[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0),
        (&[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], "Энергия", 0),
    ];

    #[test]
    fn decode_koi8_u_encoding_rs() {
        for &(chunks, want, err_count) in KOI8_U.iter() {
            let sink = Accumulate::new();
            check_decode(Decoder::new(enc::KOI8_U, sink), chunks, want, err_count);
        }
    }

    // Double-byte Korean input, including truncated and malformed sequences.
    // The table is named for windows-949; the decoder under test is built
    // from `enc::EUC_KR`.
    const WINDOWS_949: Tests = &[
        (&[], "", 0),
        (&[b""], "", 0),
        (&[b"\xbe\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0),
        (&[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"], "안녕하세요", 0),
        (&[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1),
        (&[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1),
        (&[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1),
    ];

    #[test]
    fn decode_windows_949_encoding_rs() {
        for &(chunks, want, err_count) in WINDOWS_949.iter() {
            let sink = Accumulate::new();
            check_decode(Decoder::new(enc::EUC_KR, sink), chunks, want, err_count);
        }
    }
}