use tendril::{Tendril, Atomicity, NonAtomic};
use fmt;
use std::borrow::Cow;
use std::fs::File;
use std::io;
use std::marker::PhantomData;
use std::path::Path;
#[cfg(feature = "encoding")] use encoding::{EncodingRef, RawDecoder};
use utf8;
/// A "push"-style consumer of tendrils.
///
/// Input is handed to the sink piece by piece through `process`, problems are
/// reported through `error`, and `finish` consumes the sink to produce its
/// final `Output`. The provided methods (`one`, `from_iter`, `read_from`,
/// `from_file`) are convenience drivers built on those three primitives.
pub trait TendrilSink<F, A=NonAtomic>
    where F: fmt::Format,
          A: Atomicity,
{
    /// Feed one tendril of input to the sink.
    fn process(&mut self, t: Tendril<F, A>);

    /// Report an error, described by `desc`, to the sink.
    fn error(&mut self, desc: Cow<'static, str>);

    /// The value produced once the stream has ended.
    type Output;

    /// Signal the end of the stream and return the final result.
    fn finish(self) -> Self::Output;

    /// Process a single tendril (or anything convertible into one), then finish.
    fn one<T>(mut self, t: T) -> Self::Output
        where Self: Sized, T: Into<Tendril<F, A>>
    {
        self.process(t.into());
        self.finish()
    }

    /// Process every item produced by `i`, in order, then finish.
    fn from_iter<I>(mut self, i: I) -> Self::Output
        where Self: Sized, I: IntoIterator, I::Item: Into<Tendril<F, A>>
    {
        for t in i {
            self.process(t.into())
        }
        self.finish()
    }

    /// Read `r` until it reports end of stream, processing the data
    /// incrementally in chunks of at most 4 KiB, then finish.
    ///
    /// Reads that fail with `io::ErrorKind::Interrupted` are retried.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error of any other kind; the sink is dropped
    /// without `finish` being called in that case.
    fn read_from<R>(mut self, r: &mut R) -> io::Result<Self::Output>
        where Self: Sized, R: io::Read, F: fmt::SliceFormat<Slice=[u8]>
    {
        const BUFFER_SIZE: u32 = 4 * 1024;
        // Source of initialized bytes for every fresh read buffer. `R` is an
        // arbitrary `Read` implementation and is allowed to look at the buffer
        // it is given, so the buffer must never contain uninitialized memory.
        static ZEROES: [u8; BUFFER_SIZE as usize] = [0; BUFFER_SIZE as usize];
        loop {
            // A fresh tendril per chunk: ownership of it is handed to `process`.
            let mut tendril = Tendril::<F, A>::from_slice(&ZEROES[..]);
            loop {
                match r.read(&mut tendril) {
                    // Zero bytes read means end of stream.
                    Ok(0) => return Ok(self.finish()),
                    Ok(n) => {
                        // Trim the unused tail so only the bytes actually
                        // read are forwarded (a conforming reader returns
                        // `n <= BUFFER_SIZE`).
                        tendril.pop_back(BUFFER_SIZE - n as u32);
                        self.process(tendril);
                        break
                    }
                    // Interrupted reads are transient: retry with the same buffer.
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => {}
                    Err(e) => return Err(e)
                }
            }
        }
    }

    /// Open the file at `path` and process its contents as `read_from` does.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened or if reading it fails.
    fn from_file<P>(self, path: P) -> io::Result<Self::Output>
        where Self: Sized, P: AsRef<Path>, F: fmt::SliceFormat<Slice=[u8]>
    {
        self.read_from(&mut try!(File::open(path)))
    }
}
/// A `TendrilSink` adaptor that accepts byte tendrils, decodes them as UTF-8
/// and forwards the decoded text to an inner UTF-8 sink.
///
/// Decoding is lossy: each invalid byte sequence is reported to the inner sink
/// through `error` and replaced by `utf8::REPLACEMENT_CHARACTER`.
pub struct Utf8LossyDecoder<Sink, A=NonAtomic>
where Sink: TendrilSink<fmt::UTF8, A>,
A: Atomicity
{
/// The sink that receives the decoded UTF-8 tendrils and error reports.
pub inner_sink: Sink,
/// A code point whose bytes were cut off at the end of the previous input
/// chunk, kept until the next chunk (or the end of the stream) resolves it.
incomplete: Option<utf8::Incomplete>,
/// Ties the otherwise unused type parameter `A` to the struct.
marker: PhantomData<A>,
}
impl<Sink, A> Utf8LossyDecoder<Sink, A>
    where Sink: TendrilSink<fmt::UTF8, A>,
          A: Atomicity,
{
    /// Wrap `inner_sink` in a decoder that starts with no pending partial
    /// code point.
    #[inline]
    pub fn new(inner_sink: Sink) -> Self {
        Utf8LossyDecoder {
            marker: PhantomData,
            incomplete: None,
            inner_sink: inner_sink,
        }
    }
}
impl<Sink, A> TendrilSink<fmt::Bytes, A> for Utf8LossyDecoder<Sink, A>
    where Sink: TendrilSink<fmt::UTF8, A>,
          A: Atomicity,
{
    type Output = Sink::Output;

    /// Decode one chunk of bytes and forward the resulting text downstream.
    ///
    /// Valid runs are forwarded as tendrils; every invalid sequence produces
    /// an `error` call followed by a replacement character. A code point that
    /// is cut off at the end of the chunk is remembered for the next call.
    #[inline]
    fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) {
        // Step 1: try to finish a code point left over from the previous chunk.
        if let Some(mut pending) = self.incomplete.take() {
            // All work that needs the borrowed results of `try_complete` is
            // done inside the closure; only a plain byte count escapes, so
            // `t` is free to be mutated afterwards.
            let consumed = pending.try_complete(&t).map(|(outcome, remainder)| {
                if let Ok(text) = outcome {
                    self.inner_sink.process(Tendril::from_slice(text));
                } else {
                    self.inner_sink.error("invalid byte sequence".into());
                    self.inner_sink.process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                }
                t.len() - remainder.len()
            });
            if let Some(count) = consumed {
                t.pop_front(count as u32);
            } else {
                // Still not enough bytes: keep waiting for more input.
                self.incomplete = Some(pending);
                return;
            }
        }

        // Step 2: decode whatever is left of the chunk.
        while !t.is_empty() {
            // Reduce the decoder's answer to lengths / owned state so that no
            // borrow of `t` outlives this statement.
            let verdict = match utf8::decode(&t) {
                Err(utf8::DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
                    debug_assert!(valid_prefix.len() <= t.len());
                    Err((valid_prefix.len(), Ok(incomplete_suffix)))
                }
                Err(utf8::DecodeError::Invalid { valid_prefix, invalid_sequence, .. }) => {
                    debug_assert!(valid_prefix.as_ptr() == t.as_ptr());
                    debug_assert!(valid_prefix.len() <= t.len());
                    let good = valid_prefix.len();
                    Err((good, Err(good + invalid_sequence.len())))
                }
                Ok(whole) => {
                    debug_assert!(whole.as_ptr() == t.as_ptr());
                    debug_assert!(whole.len() == t.len());
                    Ok(())
                }
            };

            let (valid_len, follow_up) = match verdict {
                Ok(()) => {
                    // SAFETY: `utf8::decode` just accepted the entire
                    // contents of `t` as UTF-8.
                    unsafe {
                        self.inner_sink.process(t.reinterpret_without_validating())
                    }
                    return;
                }
                Err(parts) => parts,
            };

            if valid_len > 0 {
                let head = t.subtendril(0, valid_len as u32);
                // SAFETY: `head` covers exactly the prefix that
                // `utf8::decode` reported as valid UTF-8.
                unsafe {
                    self.inner_sink.process(head.reinterpret_without_validating())
                }
            }

            match follow_up {
                Err(skip_to) => {
                    // Invalid bytes: report, substitute, and resume after them.
                    self.inner_sink.error("invalid byte sequence".into());
                    self.inner_sink.process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
                    t.pop_front(skip_to as u32);
                }
                Ok(partial) => {
                    // The chunk ends in the middle of a code point.
                    self.incomplete = Some(partial);
                    return;
                }
            }
        }
    }

    /// Pass an error report straight through to the inner sink.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        self.inner_sink.error(desc);
    }

    /// End the stream. A code point still pending at this point can never be
    /// completed, so it is reported and replaced before the inner sink is
    /// finished.
    #[inline]
    fn finish(mut self) -> Sink::Output {
        match self.incomplete {
            Some(_) => {
                self.inner_sink.error("incomplete byte sequence at end of stream".into());
                self.inner_sink.process(Tendril::from_slice(utf8::REPLACEMENT_CHARACTER));
            }
            None => {}
        }
        self.inner_sink.finish()
    }
}
/// A `TendrilSink` adaptor that accepts byte tendrils in a caller-chosen
/// character encoding and forwards the decoded text to an inner UTF-8 sink.
///
/// Decoding is lossy: each decoding error is reported to the inner sink and
/// replaced by U+FFFD. Only available with the `encoding` feature.
#[cfg(feature = "encoding")]
pub struct LossyDecoder<Sink, A=NonAtomic>
where Sink: TendrilSink<fmt::UTF8, A>,
A: Atomicity {
/// The concrete decoding strategy, chosen when the decoder is constructed.
inner: LossyDecoderInner<Sink, A>,
}
/// The two decoding strategies behind `LossyDecoder`.
#[cfg(feature = "encoding")]
enum LossyDecoderInner<Sink, A>
where Sink: TendrilSink<fmt::UTF8, A>,
A: Atomicity {
/// UTF-8 input, handled by the dedicated `Utf8LossyDecoder` (which owns the sink).
Utf8(Utf8LossyDecoder<Sink, A>),
/// Any other encoding: a raw decoder from the `encoding` crate plus the
/// sink that receives its output.
Other(Box<RawDecoder>, Sink)
}
#[cfg(feature = "encoding")]
impl<Sink, A> LossyDecoder<Sink, A>
    where Sink: TendrilSink<fmt::UTF8, A>,
          A: Atomicity,
{
    /// Create a decoder for `encoding` that forwards its output to `sink`.
    ///
    /// An encoding whose name is exactly `"utf-8"` is routed to the dedicated
    /// UTF-8 decoder; every other encoding uses its own raw decoder.
    #[inline]
    pub fn new(encoding: EncodingRef, sink: Sink) -> LossyDecoder<Sink, A> {
        match encoding.name() {
            "utf-8" => LossyDecoder::utf8(sink),
            _ => LossyDecoder {
                inner: LossyDecoderInner::Other(encoding.raw_decoder(), sink)
            },
        }
    }

    /// Create a decoder for UTF-8 input that forwards its output to `sink`.
    #[inline]
    pub fn utf8(sink: Sink) -> LossyDecoder<Sink, A> {
        let utf8_decoder = Utf8LossyDecoder::new(sink);
        LossyDecoder {
            inner: LossyDecoderInner::Utf8(utf8_decoder)
        }
    }

    /// Borrow the sink that receives the decoded output.
    pub fn inner_sink(&self) -> &Sink {
        match self.inner {
            LossyDecoderInner::Other(_, ref sink) => sink,
            LossyDecoderInner::Utf8(ref decoder) => &decoder.inner_sink,
        }
    }

    /// Mutably borrow the sink that receives the decoded output.
    pub fn inner_sink_mut(&mut self) -> &mut Sink {
        match self.inner {
            LossyDecoderInner::Other(_, ref mut sink) => sink,
            LossyDecoderInner::Utf8(ref mut decoder) => &mut decoder.inner_sink,
        }
    }
}
#[cfg(feature = "encoding")]
impl<Sink, A> TendrilSink<fmt::Bytes, A> for LossyDecoder<Sink, A>
    where Sink: TendrilSink<fmt::UTF8, A>,
          A: Atomicity,
{
    type Output = Sink::Output;

    /// Decode one chunk of bytes and forward the resulting text downstream.
    ///
    /// UTF-8 input is delegated to the inner `Utf8LossyDecoder`. For any other
    /// encoding the chunk is fed to the raw decoder; each decoding error adds
    /// U+FFFD to the output and is reported to the sink, and decoding resumes
    /// at the offset named by the error. Output is forwarded only if non-empty.
    #[inline]
    fn process(&mut self, mut t: Tendril<fmt::Bytes, A>) {
        let (raw, downstream) = match self.inner {
            LossyDecoderInner::Other(ref mut raw, ref mut downstream) => (raw, downstream),
            LossyDecoderInner::Utf8(ref mut fast_path) => return fast_path.process(t),
        };

        let mut decoded = Tendril::new();
        // Keep feeding the unconsumed remainder of `t` until a pass completes
        // without reporting an error.
        while let (_, Some(failure)) = raw.raw_feed(&*t, &mut decoded) {
            decoded.push_char('\u{fffd}');
            downstream.error(failure.cause);
            // NOTE(review): a negative `upto` is only caught in debug builds;
            // in release the cast below wraps to a very large count. Confirm
            // against the `encoding` crate whether negative offsets can occur.
            debug_assert!(failure.upto >= 0);
            t.pop_front(failure.upto as u32);
        }

        if decoded.len() > 0 {
            downstream.process(decoded);
        }
    }

    /// Pass an error report straight through to the underlying sink.
    #[inline]
    fn error(&mut self, desc: Cow<'static, str>) {
        match self.inner {
            LossyDecoderInner::Other(_, ref mut downstream) => downstream.error(desc),
            LossyDecoderInner::Utf8(ref mut fast_path) => fast_path.error(desc),
        }
    }

    /// End the stream: flush the decoder (substituting U+FFFD and reporting an
    /// error if it ends in a bad state), forward any final output, and finish
    /// the underlying sink.
    #[inline]
    fn finish(self) -> Sink::Output {
        let (mut raw, mut downstream) = match self.inner {
            LossyDecoderInner::Other(raw, downstream) => (raw, downstream),
            LossyDecoderInner::Utf8(fast_path) => return fast_path.finish(),
        };

        let mut tail = Tendril::new();
        match raw.raw_finish(&mut tail) {
            Some(failure) => {
                tail.push_char('\u{fffd}');
                downstream.error(failure.cause);
            }
            None => {}
        }

        if tail.len() > 0 {
            downstream.process(tail);
        }
        downstream.finish()
    }
}
#[cfg(test)]
mod test {
    use super::{TendrilSink, Utf8LossyDecoder};
    use tendril::{Tendril, Atomicity, NonAtomic};
    use fmt;
    use std::borrow::Cow;

    #[cfg(feature = "encoding")] use encoding::EncodingRef;
    #[cfg(feature = "encoding")] use encoding::all as enc;
    #[cfg(feature = "encoding")] use super::LossyDecoder;
    #[cfg(feature = "encoding")] use tendril::SliceExt;

    /// Test sink that records every tendril and every error message it receives.
    struct Accumulate<A>
        where A: Atomicity,
    {
        tendrils: Vec<Tendril<fmt::UTF8, A>>,
        errors: Vec<String>,
    }

    impl<A> Accumulate<A>
        where A: Atomicity,
    {
        /// Start with nothing recorded.
        fn new() -> Accumulate<A> {
            Accumulate {
                tendrils: Vec::new(),
                errors: Vec::new(),
            }
        }
    }

    impl<A> TendrilSink<fmt::UTF8, A> for Accumulate<A>
        where A: Atomicity,
    {
        type Output = (Vec<Tendril<fmt::UTF8, A>>, Vec<String>);

        fn process(&mut self, t: Tendril<fmt::UTF8, A>) {
            self.tendrils.push(t);
        }

        fn error(&mut self, desc: Cow<'static, str>) {
            let message: String = desc.into_owned();
            self.errors.push(message);
        }

        fn finish(self) -> Self::Output {
            let Accumulate { tendrils, errors } = self;
            (tendrils, errors)
        }
    }

    /// Run `input` chunk by chunk through `Utf8LossyDecoder` and check both the
    /// exact sequence of emitted tendrils and the number of reported errors.
    fn check_utf8(input: &[&[u8]], expected: &[&str], errs: usize) {
        let sink = Accumulate::<NonAtomic>::new();
        let (chunks, problems) = Utf8LossyDecoder::new(sink).from_iter(input.iter().cloned());
        let decoded: Vec<&str> = chunks.iter().map(|chunk| &**chunk).collect();
        assert_eq!(expected, &*decoded);
        assert_eq!(errs, problems.len());
    }

    #[test]
    fn utf8() {
        // Empty and plain ASCII input.
        check_utf8(&[], &[], 0);
        check_utf8(&[b""], &[], 0);
        check_utf8(&[b"xyz"], &["xyz"], 0);
        check_utf8(&[b"x", b"y", b"z"], &["x", "y", "z"], 0);

        // A three-byte code point, whole and split across chunk boundaries.
        check_utf8(&[b"xy\xEA\x99\xAEzw"], &["xy\u{a66e}zw"], 0);
        check_utf8(&[b"xy\xEA", b"\x99\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"xy\xEA", b"\x99", b"\xAEzw"], &["xy", "\u{a66e}z", "w"], 0);
        check_utf8(&[b"\xEA", b"", b"\x99", b"", b"\xAE"], &["\u{a66e}"], 0);
        check_utf8(&[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], &["\u{a66e}"], 0);

        // Invalid bytes interrupting a split code point.
        check_utf8(&[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
                   &["xy", "\u{fffd}", "\u{fffd}", "\u{fffd}", "\u{fffd}", "z"], 4);
        check_utf8(&[b"xy\xEA\x99", b"\xFFz"],
                   &["xy", "\u{fffd}", "\u{fffd}", "z"], 2);

        // Two-byte code points in various chunkings.
        check_utf8(&[b"\xC5\x91\xC5\x91\xC5\x91"], &["őőő"], 0);
        check_utf8(&[b"\xC5\x91", b"\xC5\x91", b"\xC5\x91"], &["ő", "ő", "ő"], 0);
        check_utf8(&[b"\xC5", b"\x91\xC5", b"\x91\xC5", b"\x91"],
                   &["ő", "ő", "ő"], 0);
        check_utf8(&[b"\xC5", b"\x91\xff", b"\x91\xC5", b"\x91"],
                   &["ő", "\u{fffd}", "\u{fffd}", "ő"], 2);

        // Invalid or truncated input at the end of the stream.
        check_utf8(&[b"\xC0"], &["\u{fffd}"], 1);
        check_utf8(&[b"\xEA\x99"], &["\u{fffd}"], 1);
    }

    /// Run `input` chunk by chunk through a `LossyDecoder` for `enc` and check
    /// the concatenated output and the number of reported errors.
    #[cfg(feature = "encoding")]
    fn check_decode(enc: EncodingRef, input: &[&[u8]], expected: &str, errs: usize) {
        let mut decoder = LossyDecoder::new(enc, Accumulate::new());
        for chunk in input {
            decoder.process(chunk.to_tendril());
        }
        let (pieces, problems) = decoder.finish();

        // Chunk boundaries of the output are not part of the contract here,
        // so compare the joined text only.
        let mut joined: Tendril<fmt::UTF8> = Tendril::new();
        for piece in pieces {
            joined.push_tendril(&piece);
        }
        assert_eq!(expected, &*joined);
        assert_eq!(errs, problems.len());
    }

    #[test]
    #[cfg(feature = "encoding")]
    fn decode_ascii() {
        check_decode(enc::ASCII, &[], "", 0);
        check_decode(enc::ASCII, &[b""], "", 0);
        check_decode(enc::ASCII, &[b"xyz"], "xyz", 0);
        check_decode(enc::ASCII, &[b"xy", b"", b"", b"z"], "xyz", 0);
        check_decode(enc::ASCII, &[b"x", b"y", b"z"], "xyz", 0);

        check_decode(enc::ASCII, &[b"\xFF"], "\u{fffd}", 1);
        check_decode(enc::ASCII, &[b"x\xC0yz"], "x\u{fffd}yz", 1);
        check_decode(enc::ASCII, &[b"x", b"\xC0y", b"z"], "x\u{fffd}yz", 1);
        check_decode(enc::ASCII, &[b"x\xC0yz\xFF\xFFw"], "x\u{fffd}yz\u{fffd}\u{fffd}w", 3);
    }

    #[test]
    #[cfg(feature = "encoding")]
    fn decode_utf8() {
        check_decode(enc::UTF_8, &[], "", 0);
        check_decode(enc::UTF_8, &[b""], "", 0);
        check_decode(enc::UTF_8, &[b"xyz"], "xyz", 0);
        check_decode(enc::UTF_8, &[b"x", b"y", b"z"], "xyz", 0);

        check_decode(enc::UTF_8, &[b"\xEA\x99\xAE"], "\u{a66e}", 0);
        check_decode(enc::UTF_8, &[b"\xEA", b"\x99\xAE"], "\u{a66e}", 0);
        check_decode(enc::UTF_8, &[b"\xEA\x99", b"\xAE"], "\u{a66e}", 0);
        check_decode(enc::UTF_8, &[b"\xEA", b"\x99", b"\xAE"], "\u{a66e}", 0);
        check_decode(enc::UTF_8, &[b"\xEA", b"", b"\x99", b"", b"\xAE"], "\u{a66e}", 0);
        check_decode(enc::UTF_8, &[b"", b"\xEA", b"", b"\x99", b"", b"\xAE", b""], "\u{a66e}", 0);

        check_decode(enc::UTF_8, &[b"xy\xEA", b"\x99\xAEz"], "xy\u{a66e}z", 0);
        check_decode(enc::UTF_8, &[b"xy\xEA", b"\xFF", b"\x99\xAEz"],
                     "xy\u{fffd}\u{fffd}\u{fffd}\u{fffd}z", 4);
        check_decode(enc::UTF_8, &[b"xy\xEA\x99", b"\xFFz"],
                     "xy\u{fffd}\u{fffd}z", 2);

        check_decode(enc::UTF_8, &[b"\xC0"], "\u{fffd}", 1);
        check_decode(enc::UTF_8, &[b"\xEA\x99"], "\u{fffd}", 1);
    }

    #[test]
    #[cfg(feature = "encoding")]
    fn decode_koi8_u() {
        check_decode(enc::KOI8_U, &[b"\xfc\xce\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0);
        check_decode(enc::KOI8_U, &[b"\xfc\xce", b"\xc5\xd2\xc7\xc9\xd1"], "Энергия", 0);
        check_decode(enc::KOI8_U, &[b"\xfc\xce", b"\xc5\xd2\xc7", b"\xc9\xd1"], "Энергия", 0);
        check_decode(enc::KOI8_U, &[b"\xfc\xce", b"", b"\xc5\xd2\xc7", b"\xc9\xd1", b""], "Энергия", 0);
    }

    #[test]
    #[cfg(feature = "encoding")]
    fn decode_windows_949() {
        check_decode(enc::WINDOWS_949, &[], "", 0);
        check_decode(enc::WINDOWS_949, &[b""], "", 0);
        check_decode(enc::WINDOWS_949, &[b"\xbe\xc8\xb3\xe7"], "안녕", 0);
        check_decode(enc::WINDOWS_949, &[b"\xbe", b"\xc8\xb3\xe7"], "안녕", 0);
        check_decode(enc::WINDOWS_949, &[b"\xbe", b"", b"\xc8\xb3\xe7"], "안녕", 0);
        check_decode(enc::WINDOWS_949, &[b"\xbe\xc8\xb3\xe7\xc7\xcf\xbc\xbc\xbf\xe4"],
                     "안녕하세요", 0);

        check_decode(enc::WINDOWS_949, &[b"\xbe\xc8\xb3\xe7\xc7"], "안녕\u{fffd}", 1);

        check_decode(enc::WINDOWS_949, &[b"\xbe", b"", b"\xc8\xb3"], "안\u{fffd}", 1);
        check_decode(enc::WINDOWS_949, &[b"\xbe\x28\xb3\xe7"], "\u{fffd}(녕", 1);
    }

    #[test]
    fn read_from() {
        // A byte slice serves as the reader; the 0xFF byte is invalid UTF-8.
        let mut source: &[u8] = b"foo\xffbar";
        let decoder = Utf8LossyDecoder::new(Accumulate::<NonAtomic>::new());
        let (chunks, problems) = decoder.read_from(&mut source).unwrap();
        let decoded: Vec<&str> = chunks.iter().map(|chunk| &**chunk).collect();
        assert_eq!(&*decoded, &["foo", "\u{FFFD}", "bar"]);
        assert_eq!(problems, &["invalid byte sequence"]);
    }
}