1use crate::{CharsetDetector, Error as DetectorError};
24use encoding_rs::Decoder;
25use reader_ext::Rewind;
26use std::io::{Read, Seek};
27
28#[derive(Debug, thiserror::Error)]
30pub enum EncodingError {
31 #[error(transparent)]
33 IoError(#[from] std::io::Error),
34
35 #[error("字符编码错误:{0}")]
37 CharsetError(String),
38
39 #[error(transparent)]
41 DetectorError(#[from] DetectorError),
42}
43
44pub struct AutoEncodingReader<R: Read> {
49 reader: R,
51 buffer: Box<[u8]>,
53 read_buffer: Vec<u8>,
55 write_buffer: Vec<u8>,
57 encoding_name: Option<String>,
59 decoder: Decoder,
61 had_replacement_or_cant_map: bool,
63 transcode_done: bool,
65 eof: bool,
67 no_transcoding_needed: bool,
69}
70
71impl<R: Read + Seek> Rewind for AutoEncodingReader<R> {
72 fn try_rewind(&mut self) -> std::io::Result<()> {
73 self.reader.rewind()?;
74 self.read_buffer.clear();
75 self.write_buffer.clear();
76 self.decoder = self.decoder.encoding().new_decoder();
78 self.had_replacement_or_cant_map = false;
79 self.transcode_done = false;
80 self.eof = false;
81 Ok(())
82 }
83}
84
85impl<R: Read> AutoEncodingReader<R> {
86 pub(crate) fn new_with_decoder(
88 reader: R,
89 encoding_name: Option<String>,
90 decoder: Decoder,
91 initial_data: Vec<u8>,
92 decoded_data: Vec<u8>,
93 read_buffer_size: usize,
94 ) -> Self {
95 let no_transcoding_needed = decoder.encoding().name().eq_ignore_ascii_case("UTF-8");
96 let (mut initial_data, mut decoded_data) = (initial_data, decoded_data);
97 if no_transcoding_needed {
98 if initial_data
100 .windows(3)
101 .next()
102 .is_some_and(|maybe_bom| maybe_bom == b"\xef\xbb\xbf")
103 {
104 initial_data.drain(..3);
105 }
106 if decoded_data.is_empty() {
107 (initial_data, decoded_data) = (decoded_data, initial_data);
108 } else {
109 decoded_data.append(&mut initial_data);
110 }
111 }
112 let buffer = vec![0u8; read_buffer_size].into_boxed_slice();
114 Self {
115 reader,
116 encoding_name,
117 buffer,
118 read_buffer: initial_data,
119 write_buffer: decoded_data,
120 decoder,
121 had_replacement_or_cant_map: false,
122 transcode_done: false,
123 eof: false,
124 no_transcoding_needed,
125 }
126 }
127
128 #[inline]
132 pub fn new(reader: R) -> Result<Self, EncodingError> {
133 AutoEncodingReaderBuilder::with_reader(reader)
134 .fallbacks(&[encoding_rs::GB18030, encoding_rs::GBK, encoding_rs::BIG5])
135 .language_weight("zh", 2.0)
136 .build()
137 }
138
139 fn copy_from_write_buffer_to(&mut self, buffer: &mut [u8]) -> usize {
141 let min = std::cmp::min(buffer.len(), self.write_buffer.len());
142 buffer[..min].copy_from_slice(&self.write_buffer[..min]);
143 self.write_buffer = self.write_buffer[min..].to_vec();
144 min
145 }
146
147 fn decode(&mut self, buffer: &mut [u8]) -> usize {
151 if self.read_buffer.is_empty() && !self.eof {
152 return 0;
153 }
154
155 if buffer.len() > 1024 {
156 let (coder_result, num_read, num_written, has_replacement) = self
158 .decoder
159 .decode_to_utf8(&self.read_buffer, buffer, self.eof);
160 self.read_buffer = self.read_buffer[num_read..].to_vec();
161 self.had_replacement_or_cant_map |= has_replacement;
162 self.transcode_done =
163 (coder_result == encoding_rs::CoderResult::InputEmpty) && self.eof;
164 return num_written;
165 }
166
167 self.write_buffer.clear();
169 self.write_buffer.resize(8 * 1024, 0);
170 let (coder_result, num_read, num_written, has_replacement) =
171 self.decoder
172 .decode_to_utf8(&self.read_buffer, &mut self.write_buffer, self.eof);
173 self.read_buffer = self.read_buffer[num_read..].to_vec();
174 self.write_buffer.truncate(num_written);
175 self.had_replacement_or_cant_map |= has_replacement;
176 self.transcode_done = (coder_result == encoding_rs::CoderResult::InputEmpty) && self.eof;
177 if num_written > 0 {
178 return self.copy_from_write_buffer_to(buffer);
179 }
180 0
181 }
182
183 pub fn had_replacement_or_cant_map(&self) -> bool {
187 self.had_replacement_or_cant_map
188 }
189
190 pub fn decoder(&self) -> &Decoder {
192 &self.decoder
193 }
194
195 pub fn encoding_name(&self) -> &Option<String> {
197 &self.encoding_name
198 }
199}
200
201impl<R: Read> Read for AutoEncodingReader<R> {
202 fn read(&mut self, buffer: &mut [u8]) -> std::io::Result<usize> {
203 if buffer.is_empty() {
204 return Ok(0);
205 }
206
207 if !self.write_buffer.is_empty() {
209 return Ok(self.copy_from_write_buffer_to(buffer));
210 }
211
212 if self.no_transcoding_needed {
214 let n = self.reader.read(buffer)?;
223 return Ok(n);
224 }
225
226 if self.transcode_done {
228 return Ok(0);
229 }
230
231 if !self.read_buffer.is_empty() {
233 let num_written = self.decode(buffer);
234 if num_written > 0 {
235 return Ok(num_written);
236 }
237 }
238
239 let n = self.reader.read(self.buffer.as_mut())?;
241 self.read_buffer.extend_from_slice(&self.buffer[..n]);
242 self.eof = n == 0;
243 let num_written = self.decode(buffer);
244 Ok(num_written)
245 }
246}
247
248pub struct AutoEncodingReaderBuilder<R> {
250 reader: R,
251 fallbacks: Vec<&'static encoding_rs::Encoding>,
252 detect_buffer_size: usize,
253 read_buffer_size: usize,
254 language_weights: Vec<(String, f32)>,
255 default_weight: Option<f32>,
256}
257
258impl<R: Read> AutoEncodingReaderBuilder<R> {
259 pub fn with_reader(reader: R) -> Self {
261 Self {
262 reader,
263 fallbacks: Vec::new(),
264 detect_buffer_size: 8192,
265 read_buffer_size: 8192,
266 language_weights: Vec::new(),
267 default_weight: None,
268 }
269 }
270
271 pub fn fallbacks(mut self, fallbacks: &[&'static encoding_rs::Encoding]) -> Self {
273 self.fallbacks = fallbacks.to_vec();
274 self
275 }
276
277 pub fn detect_buffer_size(mut self, size: usize) -> Self {
279 self.detect_buffer_size = size;
280 self
281 }
282
283 pub fn read_buffer_size(mut self, size: usize) -> Self {
285 self.read_buffer_size = size;
286 self
287 }
288
289 pub fn language_weight(mut self, language: &str, weight: f32) -> Self {
291 self.language_weights.push((language.to_owned(), weight));
292 self
293 }
294
295 pub fn default_weight(mut self, weight: f32) -> Self {
297 self.default_weight = Some(weight);
298 self
299 }
300
301 pub fn build(self) -> Result<AutoEncodingReader<R>, EncodingError> {
303 let mut reader = self.reader;
304
305 let mut buf = vec![0u8; self.detect_buffer_size];
307 let n = reader.read(&mut buf)?;
308 let eof = n < buf.len();
309 buf.truncate(n);
310
311 if n == 0 {
312 let decoder = encoding_rs::UTF_8.new_decoder_without_bom_handling();
314 return Ok(AutoEncodingReader::new_with_decoder(
315 reader,
316 Some("UTF-8".to_owned()),
317 decoder,
318 buf,
319 vec![],
320 self.read_buffer_size,
321 ));
322 }
323
324 let mut detector = CharsetDetector::new();
326 for (lang, weight) in &self.language_weights {
327 detector.weigh_language(lang, *weight)?;
328 }
329 if let Some(w) = self.default_weight {
330 detector.set_default_weight(w);
331 }
332
333 detector.feed_data(&buf)?;
334 let candidates = detector.detect();
335 let best_candidate = candidates.best();
336
337 if let Some(candidate) = best_candidate {
338 let name = candidate.encoding_name()?;
339 let encoding = crate::encoding::as_whatwg(name)
340 .or_else(|| encoding_rs::Encoding::for_label(name.as_bytes()));
341 if let Some(enc) = encoding {
342 let decoder = enc.new_decoder();
343 return Ok(AutoEncodingReader::new_with_decoder(
344 reader,
345 Some(name.to_owned()),
346 decoder,
347 buf,
348 vec![],
349 self.read_buffer_size,
350 ));
351 }
352 }
353
354 if eof {
356 let mut decoded = Vec::new();
357 for &fallback in &self.fallbacks {
358 let mut tmp_reader = AutoEncodingReader::new_with_decoder(
359 &*buf,
360 None,
361 fallback.new_decoder(),
362 vec![],
363 Vec::with_capacity(5 * 512),
364 self.read_buffer_size,
365 );
366 decoded.clear();
367 if tmp_reader.read_to_end(&mut decoded).is_ok() {
368 return Ok(AutoEncodingReader::new_with_decoder(
369 reader,
370 None,
371 fallback.new_decoder(),
372 vec![],
373 decoded,
374 self.read_buffer_size,
375 ));
376 }
377 }
378 }
379
380 Err(EncodingError::CharsetError(
381 "未能检测到合适的字符编码,且所有后备编码均失败。".to_owned(),
382 ))
383 }
384}