use crate::{CharsetDetector, Error as DetectorError};
use encoding_rs::Decoder;
use reader_ext::Rewind;
use std::io::{Read, Seek};
/// Errors produced while constructing an [`AutoEncodingReader`].
#[derive(Debug, thiserror::Error)]
pub enum EncodingError {
/// An I/O error from the underlying reader, displayed transparently.
#[error(transparent)]
IoError(#[from] std::io::Error),
/// No usable character encoding could be determined; the payload is the
/// human-readable detail appended to the (Chinese) "character encoding error" prefix.
#[error("字符编码错误:{0}")]
CharsetError(String),
/// An error reported by the crate's charset detector, displayed transparently.
#[error(transparent)]
DetectorError(#[from] DetectorError),
}
/// A [`Read`] adapter that yields the bytes of `reader` after running them through
/// `decoder` (`decode_to_utf8`), or forwards them unchanged when the decoder's
/// encoding is UTF-8.
pub struct AutoEncodingReader<R: Read> {
/// Underlying byte source.
reader: R,
/// 8 KiB scratch space that `read` fills from `reader` before appending to `read_buffer`.
buffer: Box<[u8; 8 * 1024]>,
/// Raw input bytes that have not yet been consumed by `decoder`.
read_buffer: Vec<u8>,
/// Output bytes that are ready but have not yet been handed to the caller.
write_buffer: Vec<u8>,
/// Decoder used to convert `read_buffer` contents to UTF-8.
decoder: Decoder,
/// Accumulates (logical OR) the replacement flag returned by every `decode_to_utf8` call.
had_replacement_or_cant_map: bool,
/// Set once the decoder reports `InputEmpty` while `eof` is true; `read` then returns 0.
transcode_done: bool,
/// Whether the most recent read from `reader` returned 0 bytes.
eof: bool,
/// True when the decoder's encoding name is "UTF-8"; `read` then bypasses the decoder.
no_transcoding_needed: bool,
}
impl<R: Read + Seek> Rewind for AutoEncodingReader<R> {
    /// Seeks the underlying reader back to its start and resets all decoding state so
    /// the stream can be read again from the beginning.
    ///
    /// The decoder is replaced by a fresh one for the same encoding (without BOM
    /// handling) and the replacement flag is cleared.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while seeking, or while re-reading the first bytes
    /// in UTF-8 pass-through mode.
    fn try_rewind(&mut self) -> std::io::Result<()> {
        const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";

        self.reader.rewind()?;
        self.read_buffer.clear();
        self.write_buffer.clear();
        self.decoder = self.decoder.encoding().new_decoder_without_bom_handling();
        self.had_replacement_or_cant_map = false;
        self.transcode_done = false;
        self.eof = false;

        if self.no_transcoding_needed {
            // In pass-through mode `read` forwards raw bytes, and the constructor strips a
            // leading UTF-8 BOM from the initial data. Skip it again here so that a
            // rewound stream yields the same bytes as the first pass.
            let mut head = [0u8; 3];
            let mut filled = 0;
            while filled < head.len() {
                match self.reader.read(&mut head[filled..]) {
                    Ok(0) => break,
                    Ok(n) => filled += n,
                    Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
                    Err(e) => return Err(e),
                }
            }
            if &head[..filled] != UTF8_BOM {
                // Not a BOM: these bytes are content and must be served first.
                self.write_buffer.extend_from_slice(&head[..filled]);
            }
        }
        Ok(())
    }
}
impl<R: Read> AutoEncodingReader<R> {
/// Builds a reader around `reader` that decodes with `decoder`.
///
/// `initial_data` is raw input already taken from `reader`; `decoded_data` is output
/// that is already available to be served. When the decoder's encoding is UTF-8 no
/// transcoding happens: a leading UTF-8 BOM is removed from `initial_data`, and the
/// remaining bytes are queued directly as output (after any `decoded_data`).
fn new_with_decoder(
    reader: R,
    decoder: Decoder,
    mut initial_data: Vec<u8>,
    mut decoded_data: Vec<u8>,
) -> Self {
    const UTF8_BOM: &[u8] = b"\xef\xbb\xbf";

    let no_transcoding_needed = decoder.encoding().name() == "UTF-8";
    if no_transcoding_needed {
        if initial_data.starts_with(UTF8_BOM) {
            initial_data.drain(..UTF8_BOM.len());
        }
        if decoded_data.is_empty() {
            // Nothing decoded yet: the raw bytes simply become the pending output.
            std::mem::swap(&mut initial_data, &mut decoded_data);
        } else {
            decoded_data.extend_from_slice(&initial_data);
            initial_data.clear();
        }
    }

    Self {
        reader,
        buffer: Box::new([0u8; 8 * 1024]),
        read_buffer: initial_data,
        write_buffer: decoded_data,
        decoder,
        had_replacement_or_cant_map: false,
        transcode_done: false,
        eof: false,
        no_transcoding_needed,
    }
}
/// Creates a reader, choosing the encoding from the first chunk of `reader`.
///
/// Up to 8 KiB are read once and passed to `CharsetDetector::detect_data`. If the best
/// candidate maps to a known encoding, that encoding is used. Otherwise, when the first
/// read returned fewer bytes than the buffer holds (taken as "the whole input was
/// read"), each encoding in `fallbacks` is tried in order and the first one that decodes
/// the data without needing replacement characters is used.
///
/// An empty input yields a UTF-8 pass-through reader.
///
/// # Errors
///
/// * [`EncodingError::IoError`] if the initial read fails.
/// * [`EncodingError::DetectorError`] if detection or candidate name lookup fails.
/// * [`EncodingError::CharsetError`] if neither detection nor any fallback yields a
///   usable encoding.
pub fn new_with_fallbacks(
    mut reader: R,
    fallbacks: &[&'static encoding_rs::Encoding],
) -> Result<Self, EncodingError> {
    let mut buf = vec![0u8; 8 * 1024];
    let n = reader.read(&mut buf)?;
    // NOTE(review): a short read is treated as end of input; readers that return
    // short chunks mid-stream will take the fallback path on partial data.
    let eof = n < buf.len();
    buf.truncate(n);
    if n == 0 {
        let decoder = encoding_rs::UTF_8.new_decoder_without_bom_handling();
        return Ok(Self::new_with_decoder(reader, decoder, buf, Vec::new()));
    }

    let candidates = CharsetDetector::detect_data(&buf)?;
    let best_candidate = candidates.best();
    if let Some(candidate) = best_candidate {
        let name = candidate.encoding_name()?;
        let encoding = crate::encoding::to_standard(name)
            .or_else(|| encoding_rs::Encoding::for_label(name.as_bytes()));
        if let Some(enc) = encoding {
            let decoder = enc.new_decoder_without_bom_handling();
            return Ok(Self::new_with_decoder(reader, decoder, buf, Vec::new()));
        }
    }

    if eof {
        let mut decoded = Vec::new();
        for &fallback in fallbacks {
            let mut probe = AutoEncodingReader::new_with_decoder(
                &*buf,
                fallback.new_decoder(),
                Vec::new(),
                Vec::with_capacity(5 * 512),
            );
            decoded.clear();
            // Reading from a byte slice never fails, so an I/O error alone can never
            // reject a fallback; malformed input shows up as replacement characters.
            if probe.read_to_end(&mut decoded).is_err() || probe.had_replacement_or_cant_map() {
                continue;
            }
            // The probe's decoder has been driven to the end of its stream and must not
            // be used again (encoding_rs panics on a finished decoder), so hand the real
            // reader a fresh decoder for the encoding the probe ended up with.
            let decoder = probe.decoder.encoding().new_decoder_without_bom_handling();
            return Ok(AutoEncodingReader::new_with_decoder(
                reader,
                decoder,
                Vec::new(),
                decoded,
            ));
        }
    }

    Err(EncodingError::CharsetError(
        "未能检测到合适的字符编码,且所有后备编码均失败。".to_owned(),
    ))
}
/// Creates a reader with the default fallback encodings, tried in this order:
/// GB18030, GBK, Big5. See `new_with_fallbacks` for the selection rules and errors.
#[inline]
pub fn new(reader: R) -> Result<Self, EncodingError> {
    Self::new_with_fallbacks(
        reader,
        &[encoding_rs::GB18030, encoding_rs::GBK, encoding_rs::BIG5],
    )
}
/// Moves as many pending output bytes as fit from `write_buffer` into `buffer`.
///
/// Returns the number of bytes copied; those bytes are removed from the front of
/// `write_buffer`, the rest stay queued for the next call.
fn copy_from_write_buffer_to(&mut self, buffer: &mut [u8]) -> usize {
    let count = buffer.len().min(self.write_buffer.len());
    buffer[..count].copy_from_slice(&self.write_buffer[..count]);
    // `drain` shifts the remainder in place instead of allocating a new `Vec` per call.
    self.write_buffer.drain(..count);
    count
}
/// Decodes pending input from `read_buffer` and returns how many bytes were placed in
/// `buffer`.
///
/// Returns 0 without touching the decoder when there is no pending input and end of
/// input has not been seen. If `buffer` is larger than 1024 bytes the decoder writes
/// straight into it; otherwise output goes to an 8 KiB `write_buffer` first and as much
/// as fits is copied out, leaving the remainder queued.
///
/// Updates `had_replacement_or_cant_map`, and sets `transcode_done` when the decoder
/// reports `InputEmpty` while `eof` is true.
fn decode(&mut self, buffer: &mut [u8]) -> usize {
    if self.read_buffer.is_empty() && !self.eof {
        return 0;
    }

    let direct = buffer.len() > 1024;
    if !direct {
        // Small caller buffer: decode into a roomy staging area instead.
        self.write_buffer.clear();
        self.write_buffer.resize(8 * 1024, 0);
    }
    let dst: &mut [u8] = if direct {
        &mut *buffer
    } else {
        &mut self.write_buffer[..]
    };

    let (coder_result, num_read, num_written, has_replacement) =
        self.decoder.decode_to_utf8(&self.read_buffer, dst, self.eof);

    // Drop the consumed prefix in place rather than reallocating the remainder.
    self.read_buffer.drain(..num_read);
    self.had_replacement_or_cant_map |= has_replacement;
    self.transcode_done = coder_result == encoding_rs::CoderResult::InputEmpty && self.eof;

    if direct {
        return num_written;
    }
    self.write_buffer.truncate(num_written);
    // Copies nothing and returns 0 when `num_written` is 0.
    self.copy_from_write_buffer_to(buffer)
}
/// Returns `true` if any `decode_to_utf8` call made by this reader so far reported
/// that a replacement occurred. The flag is cleared again by `try_rewind`.
pub fn had_replacement_or_cant_map(&self) -> bool {
self.had_replacement_or_cant_map
}
}
impl<R: Read> Read for AutoEncodingReader<R> {
    /// Reads decoded bytes into `buffer`.
    ///
    /// Order of service: output already queued in `write_buffer`; then, in UTF-8
    /// pass-through mode, bytes straight from the underlying reader; otherwise bytes
    /// produced by decoding input from the underlying reader.
    ///
    /// Returns `Ok(0)` only when `buffer` is empty or the input is exhausted. A chunk
    /// of input that yields no output on its own (for example one that ends in the
    /// middle of a multi-byte sequence) causes another read from the underlying reader
    /// instead of a premature `Ok(0)`, which callers would interpret as end of stream.
    ///
    /// # Errors
    ///
    /// Propagates any I/O error from the underlying reader.
    fn read(&mut self, buffer: &mut [u8]) -> std::io::Result<usize> {
        if buffer.is_empty() {
            return Ok(0);
        }
        if !self.write_buffer.is_empty() {
            return Ok(self.copy_from_write_buffer_to(buffer));
        }
        if self.no_transcoding_needed {
            return self.reader.read(buffer);
        }

        loop {
            if self.transcode_done {
                return Ok(0);
            }
            if !self.read_buffer.is_empty() {
                let written = self.decode(buffer);
                // Stop as well once the decoder finished: it must not be driven again.
                if written > 0 || self.transcode_done {
                    return Ok(written);
                }
            }

            let n = self.reader.read(self.buffer.as_mut_slice())?;
            self.read_buffer.extend_from_slice(&self.buffer[..n]);
            self.eof = n == 0;

            let written = self.decode(buffer);
            // Only keep looping when input arrived but produced no output yet; every
            // iteration consumes input, so the loop ends at the latest at end of input.
            if written > 0 || self.eof {
                return Ok(written);
            }
        }
    }
}