uchardet_git/
auto_encoding_reader.rs1use crate::{CharsetDetector, Error as DetectorError};
2use encoding_rs::Decoder;
3use reader_ext::Rewind;
4use std::io::{Read, Seek};
5
6#[derive(Debug, thiserror::Error)]
8pub enum EncodingError {
9 #[error(transparent)]
11 IoError(#[from] std::io::Error),
12
13 #[error("字符编码错误:{0}")]
15 CharsetError(String),
16
17 #[error(transparent)]
19 DetectorError(#[from] DetectorError),
20}
21
22pub struct AutoEncodingReader<R: Read> {
27 reader: R,
29 buffer: Box<[u8; 8 * 1024]>,
31 read_buffer: Vec<u8>,
33 write_buffer: Vec<u8>,
35 decoder: Decoder,
37 had_replacement_or_cant_map: bool,
39 transcode_done: bool,
41 eof: bool,
43 no_transcoding_needed: bool,
45}
46
47impl<R: Read + Seek> Rewind for AutoEncodingReader<R> {
48 fn try_rewind(&mut self) -> std::io::Result<()> {
49 self.reader.rewind()?;
50 self.read_buffer.clear();
51 self.write_buffer.clear();
52 self.decoder = self.decoder.encoding().new_decoder_without_bom_handling();
54 self.had_replacement_or_cant_map = false;
55 self.transcode_done = false;
56 self.eof = false;
57 Ok(())
58 }
59}
60
61impl<R: Read> AutoEncodingReader<R> {
62 fn new_with_decoder(
64 reader: R,
65 decoder: Decoder,
66 mut initial_data: Vec<u8>,
67 mut decoded_data: Vec<u8>,
68 ) -> Self {
69 let no_transcoding_needed = decoder.encoding().name() == "UTF-8";
70 if no_transcoding_needed {
71 if initial_data
73 .windows(3)
74 .next()
75 .is_some_and(|maybe_bom| maybe_bom == b"\xef\xbb\xbf")
76 {
77 initial_data.drain(..3);
78 }
79 if decoded_data.is_empty() {
80 (initial_data, decoded_data) = (decoded_data, initial_data)
81 } else {
82 decoded_data.append(&mut initial_data);
83 }
84 }
85 Self {
86 reader,
87 buffer: Box::new([0u8; 8 * 1024]),
88 read_buffer: initial_data,
89 write_buffer: decoded_data,
90 decoder,
91 had_replacement_or_cant_map: false,
92 transcode_done: false,
93 eof: false,
94 no_transcoding_needed,
95 }
96 }
97
98 pub fn new_with_fallbacks(
109 mut reader: R,
110 fallbacks: &[&'static encoding_rs::Encoding],
111 ) -> Result<Self, EncodingError> {
112 let mut buf = vec![0u8; 8 * 1024];
114 let n = reader.read(&mut buf)?;
115 let eof = n < buf.len();
116 buf.truncate(n);
117
118 if n == 0 {
119 let decoder = encoding_rs::UTF_8.new_decoder_without_bom_handling();
121 return Ok(Self::new_with_decoder(reader, decoder, buf, vec![]));
122 }
123
124 let candidates = CharsetDetector::detect_data(&buf)?;
126 let best_candidate = candidates.best();
127
128 if let Some(candidate) = best_candidate {
129 let name = candidate.encoding_name()?;
130 let encoding = crate::encoding::to_standard(name)
132 .or_else(|| encoding_rs::Encoding::for_label(name.as_bytes()));
133 if let Some(enc) = encoding {
134 let decoder = enc.new_decoder_without_bom_handling();
135 return Ok(Self::new_with_decoder(reader, decoder, buf, vec![]));
136 } else {
137 }
139 }
140 if eof {
141 let mut buf_ = Vec::new();
142 for &fallback in fallbacks {
144 let mut reader_ = AutoEncodingReader::new_with_decoder(
145 &*buf,
146 fallback.new_decoder(),
147 vec![],
148 Vec::with_capacity(5 * 512),
149 );
150 buf_.clear();
151 match reader_.read_to_end(&mut buf_) {
152 Ok(_) => {
153 return Ok(AutoEncodingReader::new_with_decoder(
154 reader,
155 reader_.decoder,
156 vec![],
157 buf_,
158 ));
159 }
160 Err(_) => {
161 continue;
162 }
163 }
164 }
165 }
166
167 Err(EncodingError::CharsetError(
169 "未能检测到合适的字符编码,且所有后备编码均失败。".to_owned(),
170 ))
171 }
172
173 #[inline]
177 pub fn new(reader: R) -> Result<Self, EncodingError> {
178 let fallbacks = [encoding_rs::GB18030, encoding_rs::GBK, encoding_rs::BIG5];
179 Self::new_with_fallbacks(reader, &fallbacks)
180 }
181
182 fn copy_from_write_buffer_to(&mut self, buffer: &mut [u8]) -> usize {
184 let min = std::cmp::min(buffer.len(), self.write_buffer.len());
185 buffer[..min].copy_from_slice(&self.write_buffer[..min]);
186 self.write_buffer = self.write_buffer[min..].to_vec();
187 min
188 }
189
190 fn decode(&mut self, buffer: &mut [u8]) -> usize {
194 if self.read_buffer.is_empty() && !self.eof {
195 return 0;
196 }
197
198 if buffer.len() > 1024 {
199 let (coder_result, num_read, num_written, has_replacement) = self
201 .decoder
202 .decode_to_utf8(&self.read_buffer, buffer, self.eof);
203 self.read_buffer = self.read_buffer[num_read..].to_vec();
204 self.had_replacement_or_cant_map |= has_replacement;
205 self.transcode_done =
206 (coder_result == encoding_rs::CoderResult::InputEmpty) && self.eof;
207 return num_written;
208 }
209
210 self.write_buffer.clear();
212 self.write_buffer.resize(8 * 1024, 0);
213 let (coder_result, num_read, num_written, has_replacement) =
214 self.decoder
215 .decode_to_utf8(&self.read_buffer, &mut self.write_buffer, self.eof);
216 self.read_buffer = self.read_buffer[num_read..].to_vec();
217 self.write_buffer.truncate(num_written);
218 self.had_replacement_or_cant_map |= has_replacement;
219 self.transcode_done = (coder_result == encoding_rs::CoderResult::InputEmpty) && self.eof;
220 if num_written > 0 {
221 return self.copy_from_write_buffer_to(buffer);
222 }
223 0
224 }
225
226 pub fn had_replacement_or_cant_map(&self) -> bool {
230 self.had_replacement_or_cant_map
231 }
232}
233
234impl<R: Read> Read for AutoEncodingReader<R> {
235 fn read(&mut self, buffer: &mut [u8]) -> std::io::Result<usize> {
236 if buffer.is_empty() {
237 return Ok(0);
238 }
239
240 if !self.write_buffer.is_empty() {
242 return Ok(self.copy_from_write_buffer_to(buffer));
243 }
244
245 if self.no_transcoding_needed {
247 let n = self.reader.read(buffer)?;
256 return Ok(n);
257 }
258
259 if self.transcode_done {
261 return Ok(0);
262 }
263
264 if !self.read_buffer.is_empty() {
266 let num_written = self.decode(buffer);
267 if num_written > 0 {
268 return Ok(num_written);
269 }
270 }
271
272 let n = self.reader.read(self.buffer.as_mut_slice())?;
274 self.read_buffer.extend_from_slice(&self.buffer[..n]);
275 self.eof = n == 0;
276 let num_written = self.decode(buffer);
277 Ok(num_written)
278 }
279}