use crate::{CharsetDetector, Error as DetectorError};
use encoding_rs::Decoder;
use reader_ext::Rewind;
use std::io::{Read, Seek};
/// Errors that can occur while building an auto-detecting reader.
#[derive(Debug, thiserror::Error)]
pub enum EncodingError {
/// An I/O error from the underlying reader; converted automatically via `From`.
#[error(transparent)]
IoError(#[from] std::io::Error),
/// A charset-level failure described by the contained message. In this file it is
/// returned by `build` when detection yields no usable encoding and no fallback is selected.
#[error("字符编码错误:{0}")]
CharsetError(String),
/// An error reported by the crate's charset detector; converted automatically via `From`.
#[error(transparent)]
DetectorError(#[from] DetectorError),
}
/// A `Read` adapter that yields the bytes of `reader` transcoded to UTF-8 by `decoder`.
pub struct AutoEncodingReader<R: Read> {
/// The wrapped byte source.
reader: R,
/// Scratch space that `read` fills from `reader` before appending to `read_buffer`.
buffer: Box<[u8]>,
/// Raw (not yet decoded) bytes waiting to be fed to `decoder`.
read_buffer: Vec<u8>,
/// Output bytes that are ready to be copied to the caller on the next `read`.
write_buffer: Vec<u8>,
/// The `encoding_rs` decoder used to convert `read_buffer` into UTF-8.
decoder: Decoder,
/// Accumulates the "had replacement" flag reported by every decode call.
had_replacement_or_cant_map: bool,
/// Set once the decoder reports `InputEmpty` after end of input was signalled.
transcode_done: bool,
/// Whether the most recent read from `reader` returned zero bytes.
eof: bool,
/// True when the decoder's encoding is UTF-8; reads then bypass the decoder.
no_transcoding_needed: bool,
}
impl<R: Read + Seek> Rewind for AutoEncodingReader<R> {
    /// Rewinds the underlying reader and resets all decoding state so the stream can
    /// be read again from the beginning with the same encoding.
    ///
    /// In UTF-8 pass-through mode a leading byte-order mark is skipped again, so the
    /// bytes produced after a rewind match those produced on the first pass (the
    /// constructor strips the BOM from the initial data).
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while seeking or while re-reading the first bytes.
    fn try_rewind(&mut self) -> std::io::Result<()> {
        self.reader.rewind()?;
        self.read_buffer.clear();
        self.write_buffer.clear();
        // A decoder that has seen end-of-input must not be reused; start a fresh one.
        self.decoder = self.decoder.encoding().new_decoder();
        self.had_replacement_or_cant_map = false;
        self.transcode_done = false;
        self.eof = false;

        if self.no_transcoding_needed {
            // Pass-through reads hand out raw bytes, so the BOM has to be dropped here.
            let mut head = [0u8; 3];
            let mut filled = 0;
            while filled < head.len() {
                match self.reader.read(&mut head[filled..]) {
                    Ok(0) => break,
                    Ok(n) => filled += n,
                    Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                    Err(e) => return Err(e),
                }
            }
            // Anything that is not a complete BOM is real content: queue it for output.
            if !head[..filled].starts_with(b"\xef\xbb\xbf") {
                self.write_buffer.extend_from_slice(&head[..filled]);
            }
        }
        Ok(())
    }
}
impl<R: Read> AutoEncodingReader<R> {
    /// Creates a reader from an already chosen `decoder`.
    ///
    /// * `initial_data` - raw bytes already consumed from `reader` (e.g. for detection)
    ///   that still have to be decoded.
    /// * `decoded_data` - bytes that are already in output form and are returned first.
    /// * `read_buffer_size` - size of the scratch buffer used for reads from `reader`.
    ///
    /// When the decoder's encoding is UTF-8 no transcoding is performed: a leading
    /// UTF-8 byte-order mark is removed from `initial_data` and the remainder is queued
    /// for output directly (after `decoded_data`, if any).
    pub(crate) fn new_with_decoder(
        reader: R,
        decoder: Decoder,
        initial_data: Vec<u8>,
        decoded_data: Vec<u8>,
        read_buffer_size: usize,
    ) -> Self {
        let no_transcoding_needed = decoder.encoding() == encoding_rs::UTF_8;
        let (read_buffer, write_buffer) = if no_transcoding_needed {
            let mut raw = initial_data;
            if raw.starts_with(b"\xef\xbb\xbf") {
                raw.drain(..3);
            }
            let ready = if decoded_data.is_empty() {
                raw
            } else {
                let mut ready = decoded_data;
                ready.append(&mut raw);
                ready
            };
            (Vec::new(), ready)
        } else {
            (initial_data, decoded_data)
        };
        Self {
            reader,
            buffer: vec![0u8; read_buffer_size].into_boxed_slice(),
            read_buffer,
            write_buffer,
            decoder,
            had_replacement_or_cant_map: false,
            transcode_done: false,
            eof: false,
            no_transcoding_needed,
        }
    }

    /// Detects the encoding of `reader` and builds a transcoding reader for it.
    ///
    /// `fallbacks`, `detect_buffer_size` and `read_buffer_size` are forwarded to
    /// [`AutoEncodingReaderBuilder`].
    ///
    /// # Errors
    ///
    /// Returns whatever [`AutoEncodingReaderBuilder::build`] returns.
    pub fn new_with_fallbacks(
        reader: R,
        fallbacks: &[&'static encoding_rs::Encoding],
        detect_buffer_size: usize,
        read_buffer_size: usize,
    ) -> Result<Self, EncodingError> {
        AutoEncodingReaderBuilder::with_reader(reader)
            .fallbacks(fallbacks)
            .detect_buffer_size(detect_buffer_size)
            .read_buffer_size(read_buffer_size)
            .build()
    }

    /// Same as [`Self::new_with_fallbacks`] with 8192-byte detection and read buffers.
    #[inline]
    pub fn new_with_fallbacks_default(
        reader: R,
        fallbacks: &[&'static encoding_rs::Encoding],
    ) -> Result<Self, EncodingError> {
        Self::new_with_fallbacks(reader, fallbacks, 8192, 8192)
    }

    /// Same as [`Self::new_with_fallbacks_default`] with GB18030, GBK and Big5 as the
    /// fallback encodings, in that order.
    #[inline]
    pub fn new(reader: R) -> Result<Self, EncodingError> {
        let fallbacks = [encoding_rs::GB18030, encoding_rs::GBK, encoding_rs::BIG5];
        Self::new_with_fallbacks_default(reader, &fallbacks)
    }

    /// Moves as many queued output bytes as fit into `buffer` and returns that count.
    fn copy_from_write_buffer_to(&mut self, buffer: &mut [u8]) -> usize {
        let count = buffer.len().min(self.write_buffer.len());
        buffer[..count].copy_from_slice(&self.write_buffer[..count]);
        // Remove the delivered prefix in place instead of allocating a new vector.
        self.write_buffer.drain(..count);
        count
    }

    /// Book-keeping shared by both decode paths: drops the consumed input and updates
    /// the replacement and completion flags.
    fn record_decode_step(
        &mut self,
        coder_result: encoding_rs::CoderResult,
        num_read: usize,
        has_replacement: bool,
    ) {
        self.read_buffer.drain(..num_read);
        self.had_replacement_or_cant_map |= has_replacement;
        self.transcode_done = coder_result == encoding_rs::CoderResult::InputEmpty && self.eof;
    }

    /// Decodes pending input into `buffer` and returns the number of bytes written.
    ///
    /// Returns 0 without touching the decoder when there is nothing to decode yet, or
    /// when transcoding has already completed (a finished decoder must not be called
    /// again).
    fn decode(&mut self, buffer: &mut [u8]) -> usize {
        /// Caller buffers larger than this are decoded into directly.
        const DIRECT_DECODE_THRESHOLD: usize = 1024;
        /// Size of the staging buffer used for small caller buffers.
        const STAGING_BUFFER_SIZE: usize = 8 * 1024;

        if self.transcode_done {
            return 0;
        }
        if self.read_buffer.is_empty() && !self.eof {
            return 0;
        }

        if buffer.len() > DIRECT_DECODE_THRESHOLD {
            let (coder_result, num_read, num_written, has_replacement) =
                self.decoder
                    .decode_to_utf8(&self.read_buffer, buffer, self.eof);
            self.record_decode_step(coder_result, num_read, has_replacement);
            return num_written;
        }

        // Small caller buffer: decode into the staging buffer and hand out a prefix;
        // the rest stays queued in `write_buffer` for subsequent reads.
        self.write_buffer.clear();
        self.write_buffer.resize(STAGING_BUFFER_SIZE, 0);
        let (coder_result, num_read, num_written, has_replacement) =
            self.decoder
                .decode_to_utf8(&self.read_buffer, &mut self.write_buffer, self.eof);
        self.write_buffer.truncate(num_written);
        self.record_decode_step(coder_result, num_read, has_replacement);
        if num_written > 0 {
            self.copy_from_write_buffer_to(buffer)
        } else {
            0
        }
    }

    /// Returns whether any decode step so far reported a replacement.
    pub fn had_replacement_or_cant_map(&self) -> bool {
        self.had_replacement_or_cant_map
    }

    /// Returns the decoder currently in use.
    pub fn decoder(&self) -> &Decoder {
        &self.decoder
    }

    /// Returns the encoding reported by the current decoder.
    pub fn encoding(&self) -> &'static encoding_rs::Encoding {
        self.decoder.encoding()
    }
}
impl<R: Read> Read for AutoEncodingReader<R> {
    /// Reads transcoded bytes into `buffer`.
    ///
    /// Queued output is delivered first. In UTF-8 pass-through mode the call is then
    /// forwarded to the underlying reader. Otherwise input is pulled from the
    /// underlying reader and decoded until at least one byte of output is available or
    /// the stream is fully decoded, so `Ok(0)` is only returned for an empty `buffer`
    /// or at the real end of the stream.
    ///
    /// # Errors
    ///
    /// Propagates I/O errors from the underlying reader.
    fn read(&mut self, buffer: &mut [u8]) -> std::io::Result<usize> {
        if buffer.is_empty() {
            return Ok(0);
        }
        if !self.write_buffer.is_empty() {
            return Ok(self.copy_from_write_buffer_to(buffer));
        }
        if self.no_transcoding_needed {
            return self.reader.read(buffer);
        }

        // Only try to decode before reading when there is input left over from an
        // earlier call; after the first read there is always something to decode
        // (new bytes or the end-of-input signal).
        let mut have_input = !self.read_buffer.is_empty();
        loop {
            if self.transcode_done {
                return Ok(0);
            }
            if have_input {
                let num_written = self.decode(buffer);
                if num_written > 0 {
                    return Ok(num_written);
                }
                // The decoder may finish without producing output; it must not be
                // called again after that.
                if self.transcode_done {
                    return Ok(0);
                }
            }
            // No output yet (e.g. the pending bytes end in the middle of a character):
            // fetch more input instead of reporting a premature end of stream.
            let n = self.reader.read(self.buffer.as_mut())?;
            self.read_buffer.extend_from_slice(&self.buffer[..n]);
            self.eof = n == 0;
            have_input = true;
        }
    }
}
/// Builder that collects the settings used to construct an `AutoEncodingReader`.
pub struct AutoEncodingReaderBuilder<R> {
/// The byte source to detect and decode.
reader: R,
/// Encodings considered by `build` when detection does not yield a usable encoding.
fallbacks: Vec<&'static encoding_rs::Encoding>,
/// Number of bytes `build` reads up front for detection (`with_reader` sets 8192).
detect_buffer_size: usize,
/// Scratch-buffer size passed to the constructed reader (`with_reader` sets 8192).
read_buffer_size: usize,
/// `(language, weight)` pairs handed to the detector's `weigh_language` in `build`.
language_weights: Vec<(String, f32)>,
/// Optional value handed to the detector's `set_default_weight` in `build`.
default_weight: Option<f32>,
}
impl<R: Read> AutoEncodingReaderBuilder<R> {
    /// Starts a builder for `reader` with no fallbacks, 8192-byte detection and read
    /// buffers, and no language weighting.
    pub fn with_reader(reader: R) -> Self {
        Self {
            reader,
            fallbacks: Vec::new(),
            detect_buffer_size: 8192,
            read_buffer_size: 8192,
            language_weights: Vec::new(),
            default_weight: None,
        }
    }

    /// Replaces the list of fallback encodings (tried in the given order).
    pub fn fallbacks(mut self, fallbacks: &[&'static encoding_rs::Encoding]) -> Self {
        self.fallbacks = fallbacks.to_vec();
        self
    }

    /// Sets how many bytes are read up front for charset detection.
    pub fn detect_buffer_size(mut self, size: usize) -> Self {
        self.detect_buffer_size = size;
        self
    }

    /// Sets the size of the scratch buffer the built reader uses for its reads.
    pub fn read_buffer_size(mut self, size: usize) -> Self {
        self.read_buffer_size = size;
        self
    }

    /// Records a `(language, weight)` pair that `build` passes to the detector.
    pub fn language_weight(mut self, language: &str, weight: f32) -> Self {
        self.language_weights.push((language.to_owned(), weight));
        self
    }

    /// Records the default weight that `build` passes to the detector.
    pub fn default_weight(mut self, weight: f32) -> Self {
        self.default_weight = Some(weight);
        self
    }

    /// Reads up to `detect_buffer_size` bytes, detects their encoding and returns a
    /// reader that yields the whole stream as UTF-8.
    ///
    /// * Empty input yields a UTF-8 pass-through reader.
    /// * If the detector names an encoding that can be mapped to an `encoding_rs`
    ///   encoding, that encoding is used.
    /// * Otherwise, when the entire input fit into the detection buffer, the fallback
    ///   encodings are tried in order and the first one that decodes the input without
    ///   malformed sequences is used.
    ///
    /// # Errors
    ///
    /// * [`EncodingError::IoError`] if reading from the source fails.
    /// * [`EncodingError::DetectorError`] if the detector reports an error.
    /// * [`EncodingError::CharsetError`] if no encoding was detected and no fallback
    ///   applies.
    pub fn build(self) -> Result<AutoEncodingReader<R>, EncodingError> {
        let mut reader = self.reader;

        // Fill the detection buffer completely (or up to end of input). A single
        // `read` may legitimately return fewer bytes than requested, which must not be
        // mistaken for end of input.
        let mut buf = vec![0u8; self.detect_buffer_size];
        let mut filled = 0;
        while filled < buf.len() {
            match reader.read(&mut buf[filled..]) {
                Ok(0) => break,
                Ok(n) => filled += n,
                Err(e) if e.kind() == std::io::ErrorKind::Interrupted => {}
                Err(e) => return Err(EncodingError::IoError(e)),
            }
        }
        let eof = filled < buf.len();
        buf.truncate(filled);

        if filled == 0 {
            let decoder = encoding_rs::UTF_8.new_decoder_without_bom_handling();
            return Ok(AutoEncodingReader::new_with_decoder(
                reader,
                decoder,
                buf,
                Vec::new(),
                self.read_buffer_size,
            ));
        }

        let mut detector = CharsetDetector::new();
        for (lang, weight) in &self.language_weights {
            detector.weigh_language(lang, *weight)?;
        }
        if let Some(w) = self.default_weight {
            detector.set_default_weight(w);
        }
        detector.feed_data(&buf)?;
        let candidates = detector.detect();
        let best_candidate = candidates.best();
        if let Some(candidate) = best_candidate {
            let name = candidate.encoding_name()?;
            let encoding = crate::encoding::to_standard(name)
                .or_else(|| encoding_rs::Encoding::for_label(name.as_bytes()));
            if let Some(enc) = encoding {
                return Ok(AutoEncodingReader::new_with_decoder(
                    reader,
                    enc.new_decoder(),
                    buf,
                    Vec::new(),
                    self.read_buffer_size,
                ));
            }
        }

        // Fallbacks can only be validated when the complete input is in `buf`;
        // otherwise a character cut off at the buffer end would look malformed.
        if eof {
            for &fallback in &self.fallbacks {
                // `decode` sniffs a BOM exactly like `new_decoder` does, so this
                // check sees the same result the returned reader will produce.
                let had_errors = fallback.decode(&buf).2;
                if !had_errors {
                    // Hand over the raw bytes with a fresh decoder. A decoder that
                    // has already been driven to end-of-input must not be reused.
                    return Ok(AutoEncodingReader::new_with_decoder(
                        reader,
                        fallback.new_decoder(),
                        buf,
                        Vec::new(),
                        self.read_buffer_size,
                    ));
                }
            }
        }

        Err(EncodingError::CharsetError(
            "未能检测到合适的字符编码,且所有后备编码均失败。".to_owned(),
        ))
    }
}