rust_file_encode_mode_convert/
lib.rs

1use std::error::Error;
2use std::fmt;
3use std::fs::File;
4use std::io::{self, Read, Write};
5use std::path::Path;
6use std::process::exit;
7use encoding_rs::{Decoder, Encoding, GBK, UTF_8, WINDOWS_1252};
8
9use utils::{BomPeeker, TinyTranscoder};
10
11mod utils;
12
13#[derive(Clone, Debug)]
14pub struct DecodeReaderBytesBuilder {
15    encoding: Option<&'static Encoding>,
16    utf8_passthru: bool,
17    bom_override: bool,
18    strip_bom: bool,
19    bom_sniffing: bool,
20}
21
22impl Default for DecodeReaderBytesBuilder {
23    fn default() -> DecodeReaderBytesBuilder {
24        DecodeReaderBytesBuilder::new()
25    }
26}
27
28impl DecodeReaderBytesBuilder {
29    /// Create a new decoder builder with a default configuration.
30    ///
31    /// By default, no explicit encoding is used, but if a UTF-8 or UTF-16
32    /// BOM is detected, then an appropriate encoding is automatically
33    /// detected and transcoding is performed (where invalid sequences map to
34    /// the Unicode replacement codepoint).
35    pub fn new() -> DecodeReaderBytesBuilder {
36        DecodeReaderBytesBuilder {
37            encoding: None,
38            utf8_passthru: false,
39            bom_override: false,
40            strip_bom: false,
41            bom_sniffing: true,
42        }
43    }
44
45    /// Build a new decoder that wraps the given reader.
46    pub fn build<R: Read>(&self, rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
47        self.build_with_buffer(rdr, vec![0; 8 * (1 << 10)]).unwrap()
48    }
49
50    /// Build a new decoder that wraps the given reader and uses the given
51    /// buffer internally for transcoding.
52    ///
53    /// This is useful for cases where it is advantageuous to amortize
54    /// allocation. Namely, this method permits reusing a buffer for
55    /// subsequent decoders.
56    ///
57    /// This returns an error if the buffer is smaller than 4 bytes (which is
58    /// too small to hold maximum size of a single UTF-8 encoded codepoint).
59    pub fn build_with_buffer<R: Read, B: AsMut<[u8]>>(
60        &self,
61        rdr: R,
62        mut buffer: B,
63    ) -> io::Result<DecodeReaderBytes<R, B>> {
64        if buffer.as_mut().len() < 4 {
65            let msg = format!(
66                "DecodeReaderBytesBuilder: buffer of size {} is too small",
67                buffer.as_mut().len(),
68            );
69            return Err(io::Error::new(io::ErrorKind::Other, msg));
70        }
71        let encoding =
72            self.encoding.map(|enc| enc.new_decoder_with_bom_removal());
73
74
75
76        let has_detected =
77            !self.bom_sniffing || (!self.bom_override && encoding.is_some());
78
79        let peeker = if self.strip_bom {
80            BomPeeker::without_bom(rdr)
81        } else {
82            BomPeeker::with_bom(rdr)
83        };
84        Ok(DecodeReaderBytes {
85            rdr: peeker,
86            decoder: encoding,
87            tiny: TinyTranscoder::new(),
88            utf8_passthru: self.utf8_passthru,
89            buf: buffer,
90            buflen: 0,
91            pos: 0,
92            has_detected: has_detected,
93            exhausted: false,
94        })
95    }
96
97    /// Set an explicit encoding to be used by this decoder.
98    ///
99    /// When an explicit encoding is set, BOM sniffing is disabled and the
100    /// encoding provided will be used unconditionally. Errors in the encoded
101    /// bytes are replaced by the Unicode replacement codepoint.
102    ///
103    /// By default, no explicit encoding is set.
104    pub fn encoding(
105        &mut self,
106        encoding: Option<&'static Encoding>,
107    ) -> &mut DecodeReaderBytesBuilder {
108        self.encoding = encoding;
109        self
110    }
111
112
113    pub fn utf8_passthru(
114        &mut self,
115        yes: bool,
116    ) -> &mut DecodeReaderBytesBuilder {
117        self.utf8_passthru = yes;
118        self
119    }
120
121
122    pub fn strip_bom(&mut self, yes: bool) -> &mut DecodeReaderBytesBuilder {
123        self.strip_bom = yes;
124        self
125    }
126
127    /// Give the highest precedent to the BOM, if one is found.
128    ///
129    /// When this is enabled, and if a BOM is found, then the encoding
130    /// indicated by that BOM is used even if an explicit encoding has been
131    /// set via the `encoding` method.
132    ///
133    /// This does not override `utf8_passthru`.
134    ///
135    /// This is disabled by default.
136    pub fn bom_override(
137        &mut self,
138        yes: bool,
139    ) -> &mut DecodeReaderBytesBuilder {
140        self.bom_override = yes;
141        self
142    }
143
144    /// Enable BOM sniffing
145    ///
146    /// When this is enabled and an explicit encoding is not set, the decoder
147    /// will try to detect the encoding with BOM.
148    ///
149    /// When this is disabled and an explicit encoding is not set, the decoder
150    /// will treat the input as raw bytes. The bytes will be passed through
151    /// unchanged, including any BOM that may be present.
152    ///
153    /// This is enabled by default.
154    pub fn bom_sniffing(
155        &mut self,
156        yes: bool,
157    ) -> &mut DecodeReaderBytesBuilder {
158        self.bom_sniffing = yes;
159        self
160    }
161}
162
163/// An implementation of `io::Read` that transcodes to UTF-8 in a streaming
164/// fashion.
165///
166/// The high level goal of this decoder is to provide access to byte streams
167/// that are assumed to be UTF-8 unless an encoding is otherwise specified
168/// (either via a BOM or via an explicit designation of an encoding).
169///
170/// When no explicit source encoding is specified (via
171/// `DecodeReaderBytesBuilder`), the source encoding is determined by
172/// inspecting the BOM from the stream read from `R`, if one exists. If a
173/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
174/// invalid UTF-16 sequences translated to the Unicode replacement character.
175/// Similarly if a UTF-8 BOM is seen. In all other cases, the source of the
176/// underlying reader is passed through unchanged _as if_ it were UTF-8.
177///
178/// Since this particular reader does not guarantee providing valid UTF-8 to
179/// the caller, the caller must be prepared to handle invalid UTF-8 itself.
180///
181/// `R` is the type of the underlying reader and `B` is the type of an internal
182/// buffer used to store the results of transcoding. Callers may elect to reuse
183/// the internal buffer via the `DecodeReaderBytesBuilder::build_with_buffer`
184/// constructor.
185pub struct DecodeReaderBytes<R, B> {
186    /// The underlying reader, wrapped in a peeker for reading a BOM if one
187    /// exists.
188    rdr: BomPeeker<R>,
189    /// The underlying text decoder derived from the BOM or an explicitly
190    /// specified encoding, if one exists.
191    decoder: Option<Decoder>,
192    /// A "tiny transcoder" for use when a caller provides a buffer that is
193    /// too small to write at least one UTF-8 encoded codepoint to.
194    tiny: TinyTranscoder,
195    /// When enabled, if a UTF-8 BOM is observed, then the bytes are passed
196    /// through from the underlying reader as-is instead of passing through
197    /// the UTF-8 transcoder (which will replace invalid sequences with the
198    /// REPLACEMENT CHARACTER).
199    utf8_passthru: bool,
200    /// The internal buffer to store transcoded bytes before they are read by
201    /// callers.
202    buf: B,
203    /// The current position in `buf`. Subsequent reads start here.
204    pos: usize,
205    /// The number of transcoded bytes in `buf`. Subsequent reads end here.
206    buflen: usize,
207    /// Whether BOM detection has been performed yet or not.
208    has_detected: bool,
209    /// Whether the underlying reader has been exhausted or not.
210    exhausted: bool,
211}
212
213impl<R: Read, B: AsMut<[u8]>> Read for DecodeReaderBytes<R, B> {
214    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
215        self.detect()?;
216        if self.decoder.is_none() {
217            self.rdr.read(buf)
218        } else {
219            self.transcode(buf)
220        }
221    }
222}
223
224impl<R: Read> DecodeReaderBytes<R, Vec<u8>> {
225
226    pub fn new(rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
227        DecodeReaderBytesBuilder::new().build(rdr)
228    }
229}
230impl<R: Read, B: AsMut<[u8]>> DecodeReaderBytes<R, B> {
231
232    fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
233        if self.exhausted || buf.is_empty() {
234            return Ok(0);
235        }
236        let nwrite = self.tiny.read(buf)?;
237        if nwrite > 0 {
238            // We could technically mush on if the caller provided buffer is
239            // big enough, but to keep things we simple, we satisfy the
240            // contract and quit.
241            return Ok(nwrite);
242        }
243        if self.pos >= self.buflen {
244            self.fill()?;
245        }
246        if buf.len() < 4 {
247            return self.tiny_transcode(buf);
248        }
249        loop {
250            let (_, nin, nout, _) =
251                self.decoder.as_mut().unwrap().decode_to_utf8(
252                    &self.buf.as_mut()[self.pos..self.buflen],
253                    buf,
254                    false,
255                );
256            self.pos += nin;
257            // If we've written at least one byte to the caller-provided
258            // buffer, then our mission is complete.
259            if nout > 0 {
260                return Ok(nout);
261            }
262            // Otherwise, we know that our internal buffer has insufficient
263            // data to transcode at least one char, so we attempt to refill it.
264            self.fill()?;
265            // ... but quit on EOF.
266            if self.buflen == 0 {
267                let (_, _, nout, _) = self
268                    .decoder
269                    .as_mut()
270                    .unwrap()
271                    .decode_to_utf8(&[], buf, true);
272                return Ok(nout);
273            }
274        }
275    }
276
277    /// Like transcode, but deals with the case where the caller provided
278    /// buffer is less than 4.
279    fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
280        assert!(buf.len() < 4, "have a small caller buffer");
281        loop {
282            let (nin, nout) = self.tiny.transcode(
283                self.decoder.as_mut().unwrap(),
284                &self.buf.as_mut()[self.pos..self.buflen],
285                false,
286            );
287            self.pos += nin;
288            if nout > 0 {
289                // We've satisfied the contract of writing at least one byte,
290                // so we're done. The tiny transcoder is guaranteed to yield
291                // a non-zero number of bytes.
292                return self.tiny.read(buf);
293            }
294            // Otherwise, we know that our internal buffer has insufficient
295            // data to transcode at least one char, so we attempt to refill it.
296            self.fill()?;
297            // ... but quit on EOF.
298            if self.buflen == 0 {
299                self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true);
300                return self.tiny.read(buf);
301            }
302        }
303    }
304
305    /// Peeks at the underlying reader to look for a BOM. If one exists, then
306    /// an appropriate decoder is created corresponding to the detected BOM.
307    fn detect(&mut self) -> io::Result<()> {
308        if self.has_detected {
309            return Ok(());
310        }
311        self.has_detected = true;
312        let bom = self.rdr.peek_bom()?;
313        if let Some(encoding) = bom.encoding() {
314            // If we got a UTF-8 BOM, and the decoder was configured for
315            // passing through UTF-8, then don't build a decoder at all.
316            if encoding == UTF_8 && self.utf8_passthru {
317                return Ok(());
318            }
319            self.decoder = Some(encoding.new_decoder_with_bom_removal());
320        }
321        Ok(())
322    }
323
324    /// Fill the internal buffer from the underlying reader.
325    ///
326    /// If there are unread bytes in the internal buffer, then we move them
327    /// to the beginning of the internal buffer and fill the remainder.
328    ///
329    /// If the internal buffer is too small to read additional bytes, then an
330    /// error is returned.
331    fn fill(&mut self) -> io::Result<()> {
332        if self.pos < self.buflen {
333            // Despite my best efforts, I could not seem to actually exercise
334            // this code path in tests. Namely, this code path occurs when the
335            // decoder can't make any progress and also doesn't consume all of
336            // the input. Since I'm not sure how to trigger that case, this
337            // code path is actually untested!
338
339            // We can assert this because we require that the caller provided
340            // buffer be at least 4 bytes big.
341            assert!(
342                self.buflen < self.buf.as_mut().len(),
343                "internal buffer should never be exhausted"
344            );
345            let buf = self.buf.as_mut();
346            for (dst, src) in (self.pos..self.buflen).enumerate() {
347                buf[dst] = buf[src];
348            }
349            self.buflen -= self.pos;
350        } else {
351            self.buflen = 0;
352        }
353        self.pos = 0;
354        self.buflen += self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
355        if self.buflen == 0 {
356            self.exhausted = true;
357        }
358        Ok(())
359    }
360}
361
362impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
363    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
364        let mut fmter = f.debug_struct("DecodeReaderBytes");
365        fmter
366            .field("rdr", &self.rdr)
367            .field("tiny", &self.tiny)
368            .field("utf8_passthru", &self.utf8_passthru)
369            .field("buf", &self.buf)
370            .field("pos", &self.pos)
371            .field("buflen", &self.buflen)
372            .field("has_detected", &self.has_detected)
373            .field("exhausted", &self.exhausted);
374        // Because `encoding_rs::Decoder` doesn't impl `fmt::Debug`.
375        if let Some(ref d) = self.decoder {
376            let msg = format!("Some(<Decoder for {}>)", d.encoding().name());
377            fmter.field("decoder", &msg);
378        } else {
379            fmter.field("decoder", &"None");
380        }
381        fmter.finish()
382    }
383}
/// Writes `dest` to `filepath` as UTF-8 bytes, creating the file or
/// truncating an existing one.
///
/// The string is written exactly as given; no BOM handling is done here.
///
/// # Panics
///
/// Panics if the file cannot be created, written or flushed.
fn remove_bom_and_convert_to_utf8<P: AsRef<Path>>(dest: &mut String, filepath: P) {
    File::create(filepath)
        .and_then(|mut out| {
            out.write_all(dest.as_bytes())?;
            out.flush()
        })
        .unwrap();
}
390
391fn detect_gbk_encoding_and_transform_utf8<P: AsRef<Path> + Copy>(file_path:P ) -> io::Result<String> {
392    let mut file = File::open(file_path)?;
393    let mut buffer = Vec::new();
394    file.read_to_end(&mut buffer)?;
395
396    // 尝试使用 GBK 解码
397    let (encoding, _, had_errors) = GBK.decode(&buffer);
398    if had_errors {
399        eprintln!("Warning: Some errors occurred during decoding.");
400    }
401    if !encoding.is_empty() {
402        let utf8_content = encoding.to_string();
403
404        let result = utf8_content;
405        // 写入文件
406        let mut output_file = File::create(file_path)?;
407        output_file.write_all(result.as_bytes())?;
408
409        return Ok(result.to_string());
410    }
411
412    // 如果 GBK 检测失败,继续检测 GBK2312
413    let (encoding, _, _) = WINDOWS_1252.decode(&buffer);
414    if !encoding.is_empty() {
415        let utf8_content = encoding.to_string();
416        let mut output_file = File::create(file_path)?;
417        output_file.write_all(utf8_content.as_bytes())?;
418
419        let result = utf8_content;
420        return Ok(result.to_string());
421    }
422
423    Ok("Unknown encoding".to_string())
424}
425
426pub fn  translate_all_encoded_mode_file_to_utf8<P: AsRef<Path> + Copy>(file_path :P ) -> Result<(), Box<dyn Error>> {
427    let file = std::fs::File::open(&file_path)?;
428    let source_data = std::io::BufReader::new(file);
429    let mut decoder = DecodeReaderBytes::new(source_data);
430
431    let mut dest = String::new();
432    let result = decoder.read_to_string(&mut dest);
433    if result.is_err() {
434        eprintln!("Error decoding file");
435        dest = detect_gbk_encoding_and_transform_utf8(file_path)?;
436    } else {
437        remove_bom_and_convert_to_utf8(&mut dest, file_path);
438    }
439
440    Ok(())
441
442}
443