Skip to main content

uchardet_git/
auto_encoding_reader.rs

1// MIT License
2//
3// Copyright (c) 2026 worksoup <https://github.com/worksoup/>
4//
5// Permission is hereby granted, free of charge, to any person obtaining a copy
6// of this software and associated documentation files (the "Software"), to deal
7// in the Software without restriction, including without limitation the rights
8// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9// copies of the Software, and to permit persons to whom the Software is
10// furnished to do so, subject to the following conditions:
11//
12// The above copyright notice and this permission notice shall be included in all
13// copies or substantial portions of the Software.
14//
15// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21// SOFTWARE.
22
23use crate::{CharsetDetector, Error as DetectorError};
24use encoding_rs::Decoder;
25use reader_ext::Rewind;
26use std::io::{Read, Seek};
27
28/// 自动编码检测和转换读取器的错误类型
29#[derive(Debug, thiserror::Error)]
30pub enum EncodingError {
31    /// 底层IO操作错误
32    #[error(transparent)]
33    IoError(#[from] std::io::Error),
34
35    /// 字符编码检测或转换错误
36    #[error("字符编码错误:{0}")]
37    CharsetError(String),
38
39    /// uchardet 库错误
40    #[error(transparent)]
41    DetectorError(#[from] DetectorError),
42}
43
44/// 自动检测和转换文本编码的读取器
45///
46/// 该读取器使用 `uchardet` 自动检测输入流的字符编码,并将其实时转换为 UTF-8 编码。
47/// 支持多种常见编码格式,包括 GB18030、GBK、BIG5 等。
48pub struct AutoEncodingReader<R: Read> {
49    /// 底层原始读取器
50    reader: R,
51    /// 从底层读取器读取数据的可变大小的读取缓冲区(用于检测和后续读取)
52    buffer: Box<[u8]>,
53    /// 已读取但尚未解码的原始字节缓冲区(包括用于检测的部分)
54    read_buffer: Vec<u8>,
55    /// 已解码为 UTF-8 的输出缓冲区
56    write_buffer: Vec<u8>,
57    /// uchardet 返回的原始编码名称
58    encoding_name: Option<String>,
59    /// 解码器实例,用于将原始编码转换为 UTF-8
60    decoder: Decoder,
61    /// 标记解码过程中是否出现无法映射的字符(使用了替换字符)
62    had_replacement_or_cant_map: bool,
63    /// 标记转码是否已完成(所有输入已处理)
64    transcode_done: bool,
65    /// 标记是否已到达输入流的末尾
66    eof: bool,
67    /// 标记是否无需转码(输入已经是 UTF-8 编码)
68    no_transcoding_needed: bool,
69}
70
71impl<R: Read + Seek> Rewind for AutoEncodingReader<R> {
72    fn try_rewind(&mut self) -> std::io::Result<()> {
73        self.reader.rewind()?;
74        self.read_buffer.clear();
75        self.write_buffer.clear();
76        // 重置解码器为初始状态
77        self.decoder = self.decoder.encoding().new_decoder();
78        self.had_replacement_or_cant_map = false;
79        self.transcode_done = false;
80        self.eof = false;
81        Ok(())
82    }
83}
84
85impl<R: Read> AutoEncodingReader<R> {
86    /// 内部构造:使用已知解码器和初始数据创建读取器
87    pub(crate) fn new_with_decoder(
88        reader: R,
89        encoding_name: Option<String>,
90        decoder: Decoder,
91        initial_data: Vec<u8>,
92        decoded_data: Vec<u8>,
93        read_buffer_size: usize,
94    ) -> Self {
95        let no_transcoding_needed = decoder.encoding().name().eq_ignore_ascii_case("UTF-8");
96        let (mut initial_data, mut decoded_data) = (initial_data, decoded_data);
97        if no_transcoding_needed {
98            // 跳过 BOM
99            if initial_data
100                .windows(3)
101                .next()
102                .is_some_and(|maybe_bom| maybe_bom == b"\xef\xbb\xbf")
103            {
104                initial_data.drain(..3);
105            }
106            if decoded_data.is_empty() {
107                (initial_data, decoded_data) = (decoded_data, initial_data);
108            } else {
109                decoded_data.append(&mut initial_data);
110            }
111        }
112        // 分配指定大小的读取缓冲区
113        let buffer = vec![0u8; read_buffer_size].into_boxed_slice();
114        Self {
115            reader,
116            encoding_name,
117            buffer,
118            read_buffer: initial_data,
119            write_buffer: decoded_data,
120            decoder,
121            had_replacement_or_cant_map: false,
122            transcode_done: false,
123            eof: false,
124            no_transcoding_needed,
125        }
126    }
127
128    /// 使用默认检测缓冲区大小 (8KB) 和读取缓冲区大小 (8KB)、默认后备编码列表、默认语言权重("zh": 2.0)创建新的读取器
129    ///
130    /// 默认后备编码:GB18030, GBK, BIG5(针对中文环境)
131    #[inline]
132    pub fn new(reader: R) -> Result<Self, EncodingError> {
133        AutoEncodingReaderBuilder::with_reader(reader)
134            .fallbacks(&[encoding_rs::GB18030, encoding_rs::GBK, encoding_rs::BIG5])
135            .language_weight("zh", 2.0)
136            .build()
137    }
138
139    /// 从输出缓冲区复制数据到用户提供的缓冲区
140    fn copy_from_write_buffer_to(&mut self, buffer: &mut [u8]) -> usize {
141        let min = std::cmp::min(buffer.len(), self.write_buffer.len());
142        buffer[..min].copy_from_slice(&self.write_buffer[..min]);
143        self.write_buffer = self.write_buffer[min..].to_vec();
144        min
145    }
146
147    /// 解码原始字节为 UTF-8
148    ///
149    /// 将 `read_buffer` 中的原始字节解码为 UTF-8,写入到用户缓冲区或内部缓冲区
150    fn decode(&mut self, buffer: &mut [u8]) -> usize {
151        if self.read_buffer.is_empty() && !self.eof {
152            return 0;
153        }
154
155        if buffer.len() > 1024 {
156            // 用户缓冲区足够大,直接解码到其中
157            let (coder_result, num_read, num_written, has_replacement) = self
158                .decoder
159                .decode_to_utf8(&self.read_buffer, buffer, self.eof);
160            self.read_buffer = self.read_buffer[num_read..].to_vec();
161            self.had_replacement_or_cant_map |= has_replacement;
162            self.transcode_done =
163                (coder_result == encoding_rs::CoderResult::InputEmpty) && self.eof;
164            return num_written;
165        }
166
167        // 用户缓冲区太小,解码到内部缓冲区
168        self.write_buffer.clear();
169        self.write_buffer.resize(8 * 1024, 0);
170        let (coder_result, num_read, num_written, has_replacement) =
171            self.decoder
172                .decode_to_utf8(&self.read_buffer, &mut self.write_buffer, self.eof);
173        self.read_buffer = self.read_buffer[num_read..].to_vec();
174        self.write_buffer.truncate(num_written);
175        self.had_replacement_or_cant_map |= has_replacement;
176        self.transcode_done = (coder_result == encoding_rs::CoderResult::InputEmpty) && self.eof;
177        if num_written > 0 {
178            return self.copy_from_write_buffer_to(buffer);
179        }
180        0
181    }
182
183    /// 检查解码过程中是否出现了无法映射的字符
184    ///
185    /// 返回 `true` 表示解码过程中使用了替换字符(通常为 �)
186    pub fn had_replacement_or_cant_map(&self) -> bool {
187        self.had_replacement_or_cant_map
188    }
189
190    /// 返回当前使用的解码器
191    pub fn decoder(&self) -> &Decoder {
192        &self.decoder
193    }
194
195    /// uchardet 给出的原始编码名称
196    pub fn encoding_name(&self) -> &Option<String> {
197        &self.encoding_name
198    }
199}
200
201impl<R: Read> Read for AutoEncodingReader<R> {
202    fn read(&mut self, buffer: &mut [u8]) -> std::io::Result<usize> {
203        if buffer.is_empty() {
204            return Ok(0);
205        }
206
207        // 优先从输出缓冲区取数据
208        if !self.write_buffer.is_empty() {
209            return Ok(self.copy_from_write_buffer_to(buffer));
210        }
211
212        // 如果无需转码(已经是 UTF-8),直接传递数据
213        if self.no_transcoding_needed {
214            // 如果 read_buffer 还有未读取的数据(例如 BOM 之后的),应先处理;
215            // 但实际上此时永远为空(构造时已直接写入 write_buffer, 后续也不再写入 read_buffer)。
216            // if !self.read_buffer.is_empty() {
217            //     let n = std::cmp::min(buffer.len(), self.read_buffer.len());
218            //     buffer[..n].copy_from_slice(&self.read_buffer[..n]);
219            //     self.read_buffer = self.read_buffer[n..].to_vec();
220            //     return Ok(n);
221            // }
222            let n = self.reader.read(buffer)?;
223            return Ok(n);
224        }
225
226        // 如果转码已完成,返回 0 表示 EOF
227        if self.transcode_done {
228            return Ok(0);
229        }
230
231        // 如果 read_buffer 有数据,尝试解码
232        if !self.read_buffer.is_empty() {
233            let num_written = self.decode(buffer);
234            if num_written > 0 {
235                return Ok(num_written);
236            }
237        }
238
239        // 从底层读取器读取更多数据
240        let n = self.reader.read(self.buffer.as_mut())?;
241        self.read_buffer.extend_from_slice(&self.buffer[..n]);
242        self.eof = n == 0;
243        let num_written = self.decode(buffer);
244        Ok(num_written)
245    }
246}
247
248/// 构建器,用于配置 AutoEncodingReader
249pub struct AutoEncodingReaderBuilder<R> {
250    reader: R,
251    fallbacks: Vec<&'static encoding_rs::Encoding>,
252    detect_buffer_size: usize,
253    read_buffer_size: usize,
254    language_weights: Vec<(String, f32)>,
255    default_weight: Option<f32>,
256}
257
258impl<R: Read> AutoEncodingReaderBuilder<R> {
259    /// 创建新的构建器
260    pub fn with_reader(reader: R) -> Self {
261        Self {
262            reader,
263            fallbacks: Vec::new(),
264            detect_buffer_size: 8192,
265            read_buffer_size: 8192,
266            language_weights: Vec::new(),
267            default_weight: None,
268        }
269    }
270
271    /// 设置后备编码列表
272    pub fn fallbacks(mut self, fallbacks: &[&'static encoding_rs::Encoding]) -> Self {
273        self.fallbacks = fallbacks.to_vec();
274        self
275    }
276
277    /// 设置检测缓冲区大小(字节数)
278    pub fn detect_buffer_size(mut self, size: usize) -> Self {
279        self.detect_buffer_size = size;
280        self
281    }
282
283    /// 设置后续读取缓冲区大小(字节数)
284    pub fn read_buffer_size(mut self, size: usize) -> Self {
285        self.read_buffer_size = size;
286        self
287    }
288
289    /// 添加语言权重(可多次调用)
290    pub fn language_weight(mut self, language: &str, weight: f32) -> Self {
291        self.language_weights.push((language.to_owned(), weight));
292        self
293    }
294
295    /// 设置默认权重
296    pub fn default_weight(mut self, weight: f32) -> Self {
297        self.default_weight = Some(weight);
298        self
299    }
300
301    /// 构建 AutoEncodingReader
302    pub fn build(self) -> Result<AutoEncodingReader<R>, EncodingError> {
303        let mut reader = self.reader;
304
305        // 读取 detect_buffer_size 字节用于检测
306        let mut buf = vec![0u8; self.detect_buffer_size];
307        let n = reader.read(&mut buf)?;
308        let eof = n < buf.len();
309        buf.truncate(n);
310
311        if n == 0 {
312            // 空文件,直接返回 UTF-8 解码器
313            let decoder = encoding_rs::UTF_8.new_decoder_without_bom_handling();
314            return Ok(AutoEncodingReader::new_with_decoder(
315                reader,
316                Some("UTF-8".to_owned()),
317                decoder,
318                buf,
319                vec![],
320                self.read_buffer_size,
321            ));
322        }
323
324        // 使用 uchardet 检测编码
325        let mut detector = CharsetDetector::new();
326        for (lang, weight) in &self.language_weights {
327            detector.weigh_language(lang, *weight)?;
328        }
329        if let Some(w) = self.default_weight {
330            detector.set_default_weight(w);
331        }
332
333        detector.feed_data(&buf)?;
334        let candidates = detector.detect();
335        let best_candidate = candidates.best();
336
337        if let Some(candidate) = best_candidate {
338            let name = candidate.encoding_name()?;
339            let encoding = crate::encoding::as_whatwg(name)
340                .or_else(|| encoding_rs::Encoding::for_label(name.as_bytes()));
341            if let Some(enc) = encoding {
342                let decoder = enc.new_decoder();
343                return Ok(AutoEncodingReader::new_with_decoder(
344                    reader,
345                    Some(name.to_owned()),
346                    decoder,
347                    buf,
348                    vec![],
349                    self.read_buffer_size,
350                ));
351            }
352        }
353
354        // 检测失败或编码名不支持,尝试后备编码
355        if eof {
356            let mut decoded = Vec::new();
357            for &fallback in &self.fallbacks {
358                let mut tmp_reader = AutoEncodingReader::new_with_decoder(
359                    &*buf,
360                    None,
361                    fallback.new_decoder(),
362                    vec![],
363                    Vec::with_capacity(5 * 512),
364                    self.read_buffer_size,
365                );
366                decoded.clear();
367                if tmp_reader.read_to_end(&mut decoded).is_ok() {
368                    return Ok(AutoEncodingReader::new_with_decoder(
369                        reader,
370                        None,
371                        fallback.new_decoder(),
372                        vec![],
373                        decoded,
374                        self.read_buffer_size,
375                    ));
376                }
377            }
378        }
379
380        Err(EncodingError::CharsetError(
381            "未能检测到合适的字符编码,且所有后备编码均失败。".to_owned(),
382        ))
383    }
384}