rust_detect_encode/
lib.rs

use std::error::Error;
use std::fmt;
use std::fs::File;
use std::io::{self, Read, Write};
use std::path::Path;
use std::process::exit;
use encoding_rs::{Decoder, Encoding, GBK, UTF_8, WINDOWS_1252};

use utils::{BomPeeker, TinyTranscoder};

mod utils;

/// Configuration for constructing a [`DecodeReaderBytes`].
///
/// Each field is set through the builder method of the same name and is
/// consumed by `build` / `build_with_buffer`.
#[derive(Clone, Debug)]
pub struct DecodeReaderBytesBuilder {
    /// Explicit source encoding, if the caller supplied one. When present, a
    /// decoder for it is created at build time.
    encoding: Option<&'static Encoding>,
    /// Copied into the reader; when set, a detected UTF-8 BOM does not cause
    /// a decoder to be installed, so bytes are forwarded untouched.
    utf8_passthru: bool,
    /// When set, BOM sniffing is still performed even though an explicit
    /// encoding was given, and a detected BOM replaces that encoding.
    bom_override: bool,
    /// Selects `BomPeeker::without_bom` instead of `BomPeeker::with_bom`
    /// when wrapping the underlying reader.
    strip_bom: bool,
    /// When cleared, the reader is built with detection already marked as
    /// done, so the BOM is never peeked at.
    bom_sniffing: bool,
}

impl Default for DecodeReaderBytesBuilder {
    fn default() -> DecodeReaderBytesBuilder {
        DecodeReaderBytesBuilder::new()
    }
}

impl DecodeReaderBytesBuilder {
    /// Returns a builder holding the default configuration.
    ///
    /// The defaults are: no explicit encoding, UTF-8 passthru off, BOM
    /// override off, BOM stripping off and BOM sniffing on. With these
    /// settings the encoding is taken from a UTF-8 or UTF-16 BOM when one is
    /// found, and the stream is transcoded accordingly (invalid sequences
    /// become the Unicode replacement codepoint).
    pub fn new() -> DecodeReaderBytesBuilder {
        DecodeReaderBytesBuilder {
            bom_sniffing: true,
            strip_bom: false,
            bom_override: false,
            utf8_passthru: false,
            encoding: None,
        }
    }

    /// Wraps `rdr` in a decoder that owns a freshly allocated 8 KiB
    /// transcoding buffer.
    pub fn build<R: Read>(&self, rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
        let scratch = vec![0u8; 8 * (1 << 10)];
        // Cannot fail: the scratch buffer is far larger than the 4-byte
        // minimum enforced by `build_with_buffer`.
        self.build_with_buffer(rdr, scratch).unwrap()
    }

    /// Wraps `rdr` in a decoder that transcodes through the caller-supplied
    /// `buffer`.
    ///
    /// Supplying the buffer lets callers amortize the allocation by reusing
    /// one buffer across several decoders.
    ///
    /// # Errors
    ///
    /// Fails when `buffer` holds fewer than 4 bytes, since that cannot store
    /// the longest possible UTF-8 encoding of a single codepoint.
    pub fn build_with_buffer<R: Read, B: AsMut<[u8]>>(
        &self,
        rdr: R,
        mut buffer: B,
    ) -> io::Result<DecodeReaderBytes<R, B>> {
        let capacity = buffer.as_mut().len();
        if capacity < 4 {
            return Err(io::Error::new(
                io::ErrorKind::Other,
                format!(
                    "DecodeReaderBytesBuilder: buffer of size {} is too small",
                    capacity,
                ),
            ));
        }

        let decoder = self
            .encoding
            .map(|explicit| explicit.new_decoder_with_bom_removal());

        // Sniffing is still outstanding only when it is enabled and either
        // no explicit encoding was given or a BOM may override that encoding.
        let sniff_pending =
            self.bom_sniffing && (self.bom_override || decoder.is_none());

        let rdr = if self.strip_bom {
            BomPeeker::without_bom(rdr)
        } else {
            BomPeeker::with_bom(rdr)
        };

        Ok(DecodeReaderBytes {
            rdr,
            decoder,
            tiny: TinyTranscoder::new(),
            utf8_passthru: self.utf8_passthru,
            buf: buffer,
            buflen: 0,
            pos: 0,
            has_detected: !sniff_pending,
            exhausted: false,
        })
    }

    /// Chooses an explicit source encoding (or clears it with `None`).
    ///
    /// With an explicit encoding, and unless `bom_override` is enabled, no
    /// BOM sniffing takes place and the given encoding is always used.
    /// Malformed input is replaced by the Unicode replacement codepoint.
    ///
    /// No explicit encoding is set by default.
    pub fn encoding(
        &mut self,
        encoding: Option<&'static Encoding>,
    ) -> &mut Self {
        self.encoding = encoding;
        self
    }

    /// Toggles UTF-8 passthru.
    ///
    /// When enabled and a UTF-8 BOM is sniffed, no decoder is installed, so
    /// the bytes of the underlying reader are handed to the caller as-is
    /// rather than being run through the UTF-8 transcoder. The caller then
    /// has to cope with invalid UTF-8 itself.
    ///
    /// This is disabled by default.
    pub fn utf8_passthru(&mut self, yes: bool) -> &mut Self {
        self.utf8_passthru = yes;
        self
    }

    /// Toggles BOM stripping.
    ///
    /// When enabled, the underlying reader is wrapped with
    /// `BomPeeker::without_bom` instead of `BomPeeker::with_bom`.
    /// NOTE(review): presumably this drops a leading BOM from the bytes the
    /// peeker yields; confirm against `utils::BomPeeker`.
    ///
    /// This is disabled by default.
    pub fn strip_bom(&mut self, yes: bool) -> &mut Self {
        self.strip_bom = yes;
        self
    }

    /// Lets a BOM take precedence over an explicit encoding.
    ///
    /// When enabled, the stream is still sniffed for a BOM even if an
    /// encoding was configured through `encoding`, and the encoding named by
    /// a detected BOM wins.
    ///
    /// `utf8_passthru` is not overridden by this option.
    ///
    /// This is disabled by default.
    pub fn bom_override(&mut self, yes: bool) -> &mut Self {
        self.bom_override = yes;
        self
    }

    /// Toggles BOM sniffing.
    ///
    /// Enabled, and without an explicit encoding, the decoder tries to
    /// derive the encoding from a BOM.
    ///
    /// Disabled, and without an explicit encoding, the input is treated as
    /// raw bytes and forwarded unchanged, BOM included.
    ///
    /// This is enabled by default.
    pub fn bom_sniffing(&mut self, yes: bool) -> &mut Self {
        self.bom_sniffing = yes;
        self
    }
}

/// An implementation of `io::Read` that transcodes to UTF-8 in a streaming
/// fashion.
///
/// The high level goal of this decoder is to provide access to byte streams
/// that are assumed to be UTF-8 unless an encoding is otherwise specified
/// (either via a BOM or via an explicit designation of an encoding).
///
/// When no explicit source encoding is specified (via
/// `DecodeReaderBytesBuilder`), the source encoding is determined by
/// inspecting the BOM from the stream read from `R`, if one exists. If a
/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
/// invalid UTF-16 sequences translated to the Unicode replacement character.
/// Similarly if a UTF-8 BOM is seen. In all other cases, the source of the
/// underlying reader is passed through unchanged _as if_ it were UTF-8.
///
/// Since this particular reader does not guarantee providing valid UTF-8 to
/// the caller, the caller must be prepared to handle invalid UTF-8 itself.
///
/// `R` is the type of the underlying reader and `B` is the type of an internal
/// buffer used to store bytes read from `R` before they are transcoded.
/// Callers may elect to reuse the internal buffer via the
/// `DecodeReaderBytesBuilder::build_with_buffer` constructor.
pub struct DecodeReaderBytes<R, B> {
    /// The underlying reader, wrapped in a peeker for reading a BOM if one
    /// exists.
    rdr: BomPeeker<R>,
    /// The underlying text decoder derived from the BOM or an explicitly
    /// specified encoding, if one exists. When `None`, reads are forwarded
    /// straight to `rdr`.
    decoder: Option<Decoder>,
    /// A "tiny transcoder" for use when a caller provides a buffer that is
    /// too small (fewer than 4 bytes) to write at least one UTF-8 encoded
    /// codepoint to.
    tiny: TinyTranscoder,
    /// When enabled, if a UTF-8 BOM is observed, then the bytes are passed
    /// through from the underlying reader as-is instead of passing through
    /// the UTF-8 transcoder (which will replace invalid sequences with the
    /// REPLACEMENT CHARACTER).
    utf8_passthru: bool,
    /// The internal buffer that `fill` reads source bytes into; the decoder
    /// consumes `buf[pos..buflen]`.
    buf: B,
    /// The current position in `buf`. Subsequent decoding starts here.
    pos: usize,
    /// The number of valid bytes in `buf`. Subsequent decoding ends here.
    buflen: usize,
    /// Whether BOM detection has been performed yet or not. The builder sets
    /// this up front when sniffing must be skipped entirely.
    has_detected: bool,
    /// Whether the underlying reader has been exhausted or not. Set by `fill`
    /// once a refill leaves the buffer empty.
    exhausted: bool,
}

impl<R: Read, B: AsMut<[u8]>> Read for DecodeReaderBytes<R, B> {
    /// Reads bytes into `buf`, transcoding them when a decoder is active.
    ///
    /// BOM detection runs first (it is a no-op once it has been done). If no
    /// decoder results from it, the read goes straight to the underlying
    /// reader.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        self.detect()?;
        match self.decoder {
            Some(_) => self.transcode(buf),
            None => self.rdr.read(buf),
        }
    }
}

impl<R: Read> DecodeReaderBytes<R, Vec<u8>> {
    /// Wraps `rdr` in a decoder built from the default
    /// `DecodeReaderBytesBuilder` configuration.
    pub fn new(rdr: R) -> Self {
        let builder = DecodeReaderBytesBuilder::new();
        builder.build(rdr)
    }
}
impl<R: Read, B: AsMut<[u8]>> DecodeReaderBytes<R, B> {
    /// Transcodes source bytes into `buf` as UTF-8 and returns how many bytes
    /// were written (`0` for an empty `buf` or once the source is exhausted).
    ///
    /// Must only be called while a decoder is installed; the decoder is
    /// unwrapped unconditionally.
    fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        if buf.is_empty() || self.exhausted {
            return Ok(0);
        }
        // Hand out anything still parked in the tiny transcoder first. We
        // could keep going when `buf` has room left, but returning here is
        // simpler and already satisfies the `Read` contract.
        let leftover = self.tiny.read(buf)?;
        if leftover > 0 {
            return Ok(leftover);
        }
        if self.pos >= self.buflen {
            self.fill()?;
        }
        // A caller buffer under 4 bytes may not fit one codepoint, so route
        // through the tiny transcoder instead.
        if buf.len() < 4 {
            return self.tiny_transcode(buf);
        }
        loop {
            let decoder = self.decoder.as_mut().unwrap();
            let pending = &self.buf.as_mut()[self.pos..self.buflen];
            let (_, consumed, produced, _) =
                decoder.decode_to_utf8(pending, buf, false);
            self.pos += consumed;
            // One byte or more reached the caller: done.
            if produced > 0 {
                return Ok(produced);
            }
            // Nothing came out, so the buffered input could not complete a
            // char; pull in more.
            self.fill()?;
            // At EOF, flush the decoder with `last = true` and stop.
            if self.buflen == 0 {
                let (_, _, produced, _) = self
                    .decoder
                    .as_mut()
                    .unwrap()
                    .decode_to_utf8(&[], buf, true);
                return Ok(produced);
            }
        }
    }

    /// Variant of `transcode` for caller buffers shorter than 4 bytes.
    ///
    /// Output is staged in the tiny transcoder and then copied out to `buf`.
    fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        assert!(buf.len() < 4, "have a small caller buffer");
        loop {
            let decoder = self.decoder.as_mut().unwrap();
            let pending = &self.buf.as_mut()[self.pos..self.buflen];
            let (consumed, produced) =
                self.tiny.transcode(decoder, pending, false);
            self.pos += consumed;
            if produced > 0 {
                // The tiny transcoder now holds output, so reading from it
                // yields a non-zero count and fulfils the contract.
                return self.tiny.read(buf);
            }
            // The buffered input was not enough for one char; refill.
            self.fill()?;
            // At EOF, flush the decoder into the tiny transcoder and stop.
            if self.buflen == 0 {
                self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true);
                return self.tiny.read(buf);
            }
        }
    }

    /// Sniffs the underlying reader for a BOM, at most once, and installs the
    /// matching decoder when a BOM is found.
    fn detect(&mut self) -> io::Result<()> {
        if self.has_detected {
            return Ok(());
        }
        // Marked before peeking, so a failed peek is not retried.
        self.has_detected = true;
        let bom = self.rdr.peek_bom()?;
        match bom.encoding() {
            // A UTF-8 BOM combined with passthru means "leave the bytes
            // alone": no decoder is built.
            Some(encoding) if encoding == UTF_8 && self.utf8_passthru => {}
            Some(encoding) => {
                self.decoder = Some(encoding.new_decoder_with_bom_removal());
            }
            None => {}
        }
        Ok(())
    }

    /// Refills the internal buffer from the underlying reader.
    ///
    /// Bytes not yet consumed are shifted to the front of the buffer and the
    /// space after them is filled. If the buffer is empty after the read, the
    /// reader is marked as exhausted.
    ///
    /// # Panics
    ///
    /// Panics if unconsumed bytes remain while the buffer is completely
    /// full.
    fn fill(&mut self) -> io::Result<()> {
        let carried = if self.pos < self.buflen {
            // NOTE: this branch is reported as untested. It is taken when
            // the decoder makes no progress yet leaves input unconsumed, and
            // no test input is known to trigger that.

            // Holds because buffers are required to be at least 4 bytes.
            let capacity = self.buf.as_mut().len();
            assert!(
                self.buflen < capacity,
                "internal buffer should never be exhausted"
            );
            let remaining = self.buflen - self.pos;
            let start = self.pos;
            let buf = self.buf.as_mut();
            for offset in 0..remaining {
                buf[offset] = buf[start + offset];
            }
            remaining
        } else {
            0
        };
        self.pos = 0;
        self.buflen = carried;
        let nread = self.rdr.read(&mut self.buf.as_mut()[carried..])?;
        self.buflen += nread;
        if self.buflen == 0 {
            self.exhausted = true;
        }
        Ok(())
    }
}

impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
    /// Formats every field; the decoder is rendered as a descriptive string
    /// because `encoding_rs::Decoder` has no `fmt::Debug` impl.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let decoder_repr = match self.decoder {
            Some(ref d) => {
                format!("Some(<Decoder for {}>)", d.encoding().name())
            }
            None => String::from("None"),
        };
        f.debug_struct("DecodeReaderBytes")
            .field("rdr", &self.rdr)
            .field("tiny", &self.tiny)
            .field("utf8_passthru", &self.utf8_passthru)
            .field("buf", &self.buf)
            .field("pos", &self.pos)
            .field("buflen", &self.buflen)
            .field("has_detected", &self.has_detected)
            .field("exhausted", &self.exhausted)
            .field("decoder", &decoder_repr)
            .finish()
    }
}
/// Writes the bytes of `dest` to `filepath`, creating the file or truncating
/// an existing one.
///
/// Despite its name, this function neither strips a BOM nor transcodes; it
/// stores the string exactly as given.
///
/// # Panics
///
/// Panics if the file cannot be created, written or flushed.
fn remove_bom_and_convert_to_utf8<P: AsRef<Path>>(dest: &mut String, filepath: P) {
    File::create(filepath)
        .and_then(|mut out| {
            out.write_all(dest.as_bytes())?;
            out.flush()
        })
        .unwrap();
}

/// Re-encodes the file at `file_path` to UTF-8 in place by guessing a legacy
/// encoding.
///
/// The whole file is read into memory and decoded as GBK first. If the GBK
/// decoder reports malformed sequences, a warning is printed to stderr and the
/// bytes are decoded as Windows-1252 instead, so that a lossy GBK result
/// (containing replacement characters) does not overwrite the original file.
///
/// # Returns
///
/// The decoded text that was written back to the file. When decoding yields
/// no text at all (e.g. an empty file), the file is left untouched and the
/// literal string `"Unknown encoding"` is returned.
///
/// # Errors
///
/// Returns any I/O error raised while opening, reading, or rewriting the file.
fn detect_gbk_encoding_and_transform_utf8<P: AsRef<Path> + Copy>(
    file_path: P,
) -> io::Result<String> {
    let mut raw = Vec::new();
    File::open(file_path)?.read_to_end(&mut raw)?;

    // Try decoding as GBK first. The first tuple element is the decoded
    // text, not the encoding.
    let (gbk_text, _, gbk_had_errors) = GBK.decode(&raw);
    if gbk_had_errors {
        eprintln!("Warning: Some errors occurred during decoding.");
    }

    // If GBK decoding failed, fall back to Windows-1252. The fallback has to
    // be keyed on the error flag: the decoded text is non-empty for any
    // non-empty input, so an emptiness check never reaches the fallback.
    let decoded = if gbk_had_errors {
        WINDOWS_1252.decode(&raw).0
    } else {
        gbk_text
    };

    if decoded.is_empty() {
        return Ok("Unknown encoding".to_string());
    }

    // Write the UTF-8 result back over the original file.
    let mut output_file = File::create(file_path)?;
    output_file.write_all(decoded.as_bytes())?;

    Ok(decoded.into_owned())
}

pub fn  translate_all_encoded_mode_file_to_utf8<P: AsRef<Path> + Copy>(file_path :P ) -> Result<(), Box<dyn Error>> {
    let file = std::fs::File::open(&file_path)?;
    let source_data = std::io::BufReader::new(file);
    let mut decoder = DecodeReaderBytes::new(source_data);

    let mut dest = String::new();
    let result = decoder.read_to_string(&mut dest);
    if result.is_err() {
        eprintln!("Error decoding file");
        dest = detect_gbk_encoding_and_transform_utf8(file_path)?;
    } else {
        remove_bom_and_convert_to_utf8(&mut dest, file_path);
    }

    Ok(())

}
rust_detect_encode/lib.rs

rust_detect_encode/
lib.rs