use super::{ContentType, IncompleteUtf8Resync, TextEncoder, Utf8Error};
use crate::html::escape_body_text;
use encoding_rs::{Encoding, UTF_8};
/// Byte sink that accepts UTF-8 text or markup and forwards it to an output
/// handler callback.
///
/// It pairs the state used for input that arrives as raw UTF-8 byte chunks
/// (`write_utf8_chunk`) with the inner writer that performs escaping,
/// optional re-encoding and the final call into the output handler.
pub struct StreamingHandlerSink<'output_handler> {
// State consulted by `write_str` (via `discard_incomplete`) and driven by
// `write_utf8_chunk`.
incomplete_utf8: IncompleteUtf8Resync,
// Escaping / encoding / output stage. It is a separate field so that
// `write_utf8_chunk` can borrow it inside the callback while
// `incomplete_utf8` is itself mutably borrowed.
inner: StreamingHandlerSinkInner<'output_handler>,
}
/// Output half of the sink: routes strings through body-text escaping when
/// requested and, when an encoder is present, through a `TextEncoder` before
/// invoking the output handler.
struct StreamingHandlerSinkInner<'output_handler> {
/// `Some` only when the sink was created for an encoding other than UTF-8
/// (see `StreamingHandlerSink::new`); with `None` the UTF-8 bytes of each
/// string are handed to the output handler directly.
non_utf8_encoder: Option<TextEncoder>,
/// Callback receiving the final output bytes. The trait object carries no
/// `Send`/`Sync` bound, so a sink holding it is neither `Send` nor `Sync`.
output_handler: &'output_handler mut dyn FnMut(&[u8]),
}
impl<'output_handler> StreamingHandlerSink<'output_handler> {
    /// Builds a sink that delivers output for `encoding` to `output_handler`.
    ///
    /// A `TextEncoder` is created only when `encoding` is not UTF-8; for UTF-8
    /// the bytes of the written strings are forwarded without conversion.
    #[inline(always)]
    pub(crate) fn new(
        encoding: &'static Encoding,
        output_handler: &'output_handler mut dyn FnMut(&[u8]),
    ) -> Self {
        let incomplete_utf8 = IncompleteUtf8Resync::new();
        let non_utf8_encoder = if encoding == UTF_8 {
            None
        } else {
            Some(TextEncoder::new(encoding))
        };

        Self {
            incomplete_utf8,
            inner: StreamingHandlerSinkInner {
                non_utf8_encoder,
                output_handler,
            },
        }
    }

    /// Writes a complete UTF-8 string, escaping it according to `content_type`
    /// and converting the encoding when the sink is not UTF-8.
    ///
    /// May be called repeatedly; every call appends to the output.
    ///
    /// If `discard_incomplete()` reports `true`, a U+FFFD replacement character
    /// is emitted (as HTML) ahead of `content`. Presumably that signals a
    /// partial sequence left behind by an earlier `write_utf8_chunk` call,
    /// which can no longer be reported to that caller as an error.
    #[inline]
    pub fn write_str(&mut self, content: &str, content_type: ContentType) {
        let discarded = self.incomplete_utf8.discard_incomplete();
        if discarded {
            self.inner.write_html("\u{FFFD}");
        }
        self.inner.write_str(content, content_type);
    }

    /// Exposes the raw output callback; bytes passed to it bypass both the
    /// escaping and the encoding conversion performed by the `write_*` methods.
    #[inline]
    pub(crate) fn output_handler(&mut self) -> &mut dyn FnMut(&[u8]) {
        &mut self.inner.output_handler
    }

    /// Writes a fragment of UTF-8 bytes, escaping according to `content_type`
    /// and converting the encoding when the sink is not UTF-8.
    ///
    /// `content` is handed to the `IncompleteUtf8Resync` state, which calls
    /// back with each valid `&str` piece to be written. The tests in this file
    /// feed input split in the middle of multi-byte characters, so a fragment
    /// does not have to end on a character boundary as long as consecutive
    /// calls together form valid UTF-8.
    ///
    /// # Errors
    ///
    /// Propagates the `Utf8Error` returned by the `IncompleteUtf8Resync`
    /// state; the tests in this file expect it for malformed UTF-8 input.
    #[inline]
    pub fn write_utf8_chunk(
        &mut self,
        content: &[u8],
        content_type: ContentType,
    ) -> Result<(), Utf8Error> {
        // Split the borrow so the callback can use `inner` while the
        // resynchronisation state is mutably borrowed.
        let Self {
            incomplete_utf8,
            inner,
        } = self;

        incomplete_utf8.write_utf8_chunk(content, |valid_chunk| {
            inner.write_str(valid_chunk, content_type);
        })
    }
}
impl StreamingHandlerSinkInner<'_> {
    /// Dispatches `content` to the writer matching `content_type`:
    /// markup is written as-is, text goes through body-text escaping.
    #[inline]
    pub(crate) fn write_str(&mut self, content: &str, content_type: ContentType) {
        match content_type {
            ContentType::Text => self.write_body_text(content),
            ContentType::Html => self.write_html(content),
        }
    }

    /// Emits `html` without escaping, re-encoding it first when a non-UTF-8
    /// encoder is configured.
    ///
    /// An empty string is dropped without touching the encoder or the handler.
    pub(crate) fn write_html(&mut self, html: &str) {
        if html.is_empty() {
            return;
        }

        match &mut self.non_utf8_encoder {
            Some(encoder) => {
                encoder.encode(html, self.output_handler);
            }
            None => {
                (self.output_handler)(html.as_bytes());
            }
        }
    }

    /// Emits `plaintext` as body text (not attribute content — presumably the
    /// escaping rules of `escape_body_text` only suit text nodes; confirm
    /// against that helper).
    ///
    /// `escape_body_text` produces the escaped output piece by piece; each
    /// piece is re-encoded when a non-UTF-8 encoder is configured, otherwise
    /// its UTF-8 bytes go straight to the output handler.
    pub(crate) fn write_body_text(&mut self, plaintext: &str) {
        match &mut self.non_utf8_encoder {
            Some(encoder) => {
                escape_body_text(plaintext, &mut |piece| {
                    debug_assert!(!piece.is_empty());
                    encoder.encode(piece, self.output_handler);
                });
            }
            None => {
                escape_body_text(plaintext, &mut |piece| {
                    debug_assert!(!piece.is_empty());
                    (self.output_handler)(piece.as_bytes());
                });
            }
        }
    }
}
/// Valid UTF-8 must survive being written in fragments of 1 to 8 bytes,
/// regardless of where the fragment boundaries fall inside multi-byte
/// characters, and regardless of empty writes interleaved between fragments.
#[test]
fn utf8_fragments() {
    let text = "🐈°文字化けしない ▀▄ ɯopuɐɹ ⓤⓝⓘⓒⓞⓓⓔ and ascii 🐳 sʇuıodǝpoɔ ✴";

    for with_zero_writes in [false, true] {
        for len in 1..9 {
            let mut collected = Vec::new();

            // The sink (and the closure it borrows) lives only inside this
            // scope, so `collected` is free to be inspected afterwards.
            {
                let mut collect = |bytes: &[u8]| collected.extend_from_slice(bytes);
                let mut sink = StreamingHandlerSink::new(UTF_8, &mut collect);

                for (nth, chunk) in text.as_bytes().chunks(len).enumerate() {
                    let context =
                        format!("{len} at {nth} '{chunk:?}'; with_zero_writes={with_zero_writes}");

                    // An empty write must not disturb a pending partial character.
                    if with_zero_writes {
                        sink.write_utf8_chunk(b"", ContentType::Text)
                            .expect(&context);
                    }
                    sink.write_utf8_chunk(chunk, ContentType::Html)
                        .expect(&context);
                }
            }

            assert_eq!(String::from_utf8_lossy(&collected), text, "{len}");
        }
    }
}
/// Streams progressively longer inputs through an ISO-8859-2 sink and checks
/// every output byte.
///
/// The input repeats `ą0ą1…ą9`, so the output must alternate between byte 177
/// (0xB1, `ą` in ISO-8859-2) and an ASCII digit cycling through 0-9. Each
/// round writes the doubled string in two parts, split at a byte offset that
/// advances by 13 and therefore sometimes lands inside the two-byte `ą`.
#[test]
fn long_text() {
    let mut written = 0;
    let mut expected = 0;

    let mut handler = |ch: &[u8]| {
        for &byte in ch {
            written += 1;
            // Odd positions (1-based) hold `ą`, even positions hold the digit.
            let wanted = if written % 2 == 1 {
                177
            } else {
                b'0' + ((written / 2 - 1) % 10) as u8
            };
            assert!(byte == wanted, "@{written} {ch:?}");
        }
    };
    let mut sink = StreamingHandlerSink::new(encoding_rs::ISO_8859_2, &mut handler);

    let mut s = "ą0ą1ą2ą3ą4ą5ą6ą7ą8ą9".repeat(128);
    let mut split_point = 1;
    while s.len() <= (1 << 17) {
        // Double the string, then write all of it in two pieces.
        let copy = s.clone();
        s.push_str(&copy);
        // One output byte is expected per input character.
        expected += s.chars().count();

        let (head, tail) = s.as_bytes().split_at(split_point);
        sink.write_utf8_chunk(head, ContentType::Text).unwrap();
        sink.write_utf8_chunk(tail, ContentType::Html).unwrap();

        split_point += 13;
    }

    assert_eq!(expected, written);
}
/// Malformed UTF-8 must be rejected no matter how it is split into chunks.
///
/// Every sample is fed in chunks of each size from 1 up to (but excluding) its
/// full length. Either one of those writes fails, or the follow-up write of a
/// lone `<` must fail. Throughout, nothing handed to the output handler may
/// contain a raw `<`.
///
/// NOTE(review): because the chunk-size range is `1..bad.len()`, single-byte
/// samples produce an empty range and are never exercised, and no sample is
/// ever written as one whole chunk.
#[test]
fn invalid_utf8_fragments() {
    #[rustfmt::skip]
    let broken_utf8 = &[
        &b"\x31\x32\x33\xED\xA0\x80\x31"[..], b"\x31\x32\x33\xEF\x80", b"\x31\x32\x33\xEF\x80\xF0\x3c",
        b"\x37\x38\x39\xFE", b"\x37\x38\xFE", b"\x37\xFF", b"\x3c\x23\x24\xFE\x3C", b"\x3C\x23\xFE\x3C\x3C",
        b"\x3C\x3D\xE0\x80\x3C", b"\x3C\x3D\xE0\x80\xAF\x3C", b"\x3C\x3D\xE0\x80\xE0\x80\x3C",
        b"\x3C\x3D\xED\xA0\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x3C", b"\x3C\x3D\xF0\x80\x80\x80\x3C",
        b"\x3C\x3D\xF7\xBF\xBF\xBF\x3C", b"\x3C\x3D\xFF\x3C", b"\x7F", b"\x80", b"\x80\x3C",
        b"\x80\x81\x82\x83\x84\x85\x86\x87", b"\x80\xBF", b"\x80\xBF\x80", b"\x80\xBF\x80\xBF",
        b"\x80\xBF\x80\xBF\x80", b"\x80\xBF\x80\xBF\x80\xBF", b"\x81", b"\x81\x3C",
        b"\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F", b"\x90\x91\x92\x93\x94\x95\x96\x97", b"\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F",
        b"\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7", b"\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF", b"\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7",
        b"\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF", b"\xBF", b"\xC0", b"\xC0\x3C\xC1\x3C\xC2\x3C\xC3\x3C", b"\xC0\x80",
        b"\xC0\xAF", b"\xC0\xAF\xE0\x80\xBF\xF0\x81\x82\x41", b"\xC1\x3C", b"\xC1\xBF", b"\xC1\xBF", b"\xC2\x00",
        b"\xC2\x41\x42", b"\xC2\x7F", b"\xC2\xC0", b"\xC2\xFF", b"\xC4\x3C\xC5\x3C\xC6\x3C\xC7\x3C",
        b"\xC8\x3C\xC9\x3C\xCA\x3C\xCB\x3C", b"\xCC\x3C\xCD\x3C\xCE\x3C\xCF\x3C", b"\xD0\x3C\xD1\x3C\xD2\x3C\xD3\x3C",
        b"\xD4\x3C\xD5\x3C\xD6\x3C\xD7\x3C", b"\xD8\x3C\xD9\x3C\xDA\x3C\xDB\x3C", b"\xDC\x3C\xDD\x3C\xDE\x3C\xDF\x3C",
        b"\xDF", b"\xDF\x00", b"\xDF\x7F", b"\xDF\xC0", b"\xDF\xFF", b"\xE0\x3C\xE1\x3C\xE2\x3C\xE3\x3C", b"\xE0\x80",
        b"\xE0\x80\x00", b"\xE0\x80\x7F", b"\xE0\x80\x80", b"\xE0\x80\xAF", b"\xE0\x80\xC0", b"\xE0\x80\xFF",
        b"\xE0\x81\xBF", b"\xE0\x9F\xBF", b"\xE1\x80\xE2\xF0\x91\x92\xF1\xBF\x41",
        b"\xE4\x3C\xE5\x3C\xE6\x3C\xE7\x3C", b"\xE8\x3C\xE9\x3C\xEA\x3C\xEB\x3C", b"\xEC\x3C\xED\x3C\xEE\x3C\xEF\x3C",
        b"\xED\x80\x00", b"\xED\x80\x7F", b"\xED\x80\xC0", b"\xED\x80\xFF", b"\xED\xA0\x80", b"\xED\xA0\x80\x35",
        b"\xED\xA0\x80\xED\xB0\x80", b"\xED\xA0\x80\xED\xBF\xBF", b"\xED\xA0\x80\xED\xBF\xBF\xED\xAF\x41",
        b"\xED\xAD\xBF", b"\xED\xAD\xBF\xED\xB0\x80", b"\xED\xAD\xBF\xED\xBF\xBF", b"\xED\xAE\x80",
        b"\xED\xAE\x80\xED\xB0\x80", b"\xED\xAE\x80\xED\xBF\xBF", b"\xED\xAF\xBF", b"\xED\xAF\xBF\xED\xB0\x80",
        b"\xED\xAF\xBF\xED\xBF\xBF", b"\xED\xB0\x80", b"\xED\xBE\x80", b"\xED\xBF\xBF", b"\xEF\xBF",
        b"\xF0\x3C\xF1\x3C", b"\xF0\x80\x80", b"\xF0\x80\x80\x80", b"\xF0\x80\x80\xAF", b"\xF0\x80\x81\xBF",
        b"\xF0\x8F\xBF\xBF", b"\xF0\x90\x80\x00", b"\xF0\x90\x80\x7F", b"\xF0\x90\x80\xC0", b"\xF0\x90\x80\xFF",
        b"\xF1\x80\x80\x00", b"\xF1\x80\x80\x7F", b"\xF1\x80\x80\xC0", b"\xF1\x80\x80\xFF", b"\xF2\x3C\xF3\x3C",
        b"\xF4\x3C\xF5\x3C", b"\xF4\x80\x80\x00", b"\xF4\x80\x80\x7F", b"\xF4\x80\x80\xC0", b"\xF4\x80\x80\xFF",
        b"\xF4\x90\x80\x80", b"\xF4\x91\x92\x93\xFF\x41\x80\xBF\x42", b"\xF5\x3C", b"\xF6\x3C\xF7\x3C",
        b"\xF7\xBF\xBF", b"\xF7\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF", b"\xF7\xBF\xBF\xBF\xBF\xBF",
        b"\xF7\xBF\xBF\xBF\xBF\xBF\xBF", b"\xF8\x3C", b"\xF8\x80\x80\x80", b"\xF8\x80\x80\x80\xAF",
        b"\xF8\x87\xBF\xBF\xBF", b"\xF8\x88\x80\x80\x80", b"\xF9\x3C", b"\xFA\x3C", b"\xFB\x3C", b"\xFB\xBF\xBF\xBF",
        b"\xFC\x3C", b"\xFC\x80\x80\x80\x80", b"\xFC\x80\x80\x80\x80\xAF", b"\xFC\x84\x80\x80\x80\x80", b"\xFD\x3C",
        b"\xFD\xBF\xBF\xBF\xBF", b"\xFE", b"\xFF", b"\xFF\x3C"
    ];

    for bad in broken_utf8 {
        for len in 1..bad.len() {
            // Whatever reaches the output must be valid UTF-8 without a raw `<`.
            let mut handler = |ch: &[u8]| {
                let emitted = std::str::from_utf8(ch).unwrap();
                assert!(!emitted.contains('<'), "{ch:x?} of {bad:x?}");
            };
            let mut sink = StreamingHandlerSink::new(UTF_8, &mut handler);

            // Stops at the first rejected chunk; later chunks are not written.
            let rejected_early = bad
                .chunks(len)
                .any(|chunk| sink.write_utf8_chunk(chunk, ContentType::Text).is_err());

            if !rejected_early {
                // Every chunk was accepted so far, so the error has to surface
                // once a plain ASCII byte follows the bad input.
                assert!(
                    sink.write_utf8_chunk(b"<", ContentType::Text).is_err(),
                    "Shouldn't have allowed {bad:?} {}",
                    String::from_utf8_lossy(bad)
                );
            }
        }
    }
}