rust_file_encode_mode_convert/lib.rs
1use std::error::Error;
2use std::fmt;
3use std::fs::File;
4use std::io::{self, Read, Write};
5use std::path::Path;
6use std::process::exit;
7use encoding_rs::{Decoder, Encoding, GBK, UTF_8, WINDOWS_1252};
8
9use utils::{BomPeeker, TinyTranscoder};
10
11mod utils;
12
13#[derive(Clone, Debug)]
14pub struct DecodeReaderBytesBuilder {
15 encoding: Option<&'static Encoding>,
16 utf8_passthru: bool,
17 bom_override: bool,
18 strip_bom: bool,
19 bom_sniffing: bool,
20}
21
22impl Default for DecodeReaderBytesBuilder {
23 fn default() -> DecodeReaderBytesBuilder {
24 DecodeReaderBytesBuilder::new()
25 }
26}
27
28impl DecodeReaderBytesBuilder {
29 /// Create a new decoder builder with a default configuration.
30 ///
31 /// By default, no explicit encoding is used, but if a UTF-8 or UTF-16
32 /// BOM is detected, then an appropriate encoding is automatically
33 /// detected and transcoding is performed (where invalid sequences map to
34 /// the Unicode replacement codepoint).
35 pub fn new() -> DecodeReaderBytesBuilder {
36 DecodeReaderBytesBuilder {
37 encoding: None,
38 utf8_passthru: false,
39 bom_override: false,
40 strip_bom: false,
41 bom_sniffing: true,
42 }
43 }
44
45 /// Build a new decoder that wraps the given reader.
46 pub fn build<R: Read>(&self, rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
47 self.build_with_buffer(rdr, vec![0; 8 * (1 << 10)]).unwrap()
48 }
49
50 /// Build a new decoder that wraps the given reader and uses the given
51 /// buffer internally for transcoding.
52 ///
53 /// This is useful for cases where it is advantageuous to amortize
54 /// allocation. Namely, this method permits reusing a buffer for
55 /// subsequent decoders.
56 ///
57 /// This returns an error if the buffer is smaller than 4 bytes (which is
58 /// too small to hold maximum size of a single UTF-8 encoded codepoint).
59 pub fn build_with_buffer<R: Read, B: AsMut<[u8]>>(
60 &self,
61 rdr: R,
62 mut buffer: B,
63 ) -> io::Result<DecodeReaderBytes<R, B>> {
64 if buffer.as_mut().len() < 4 {
65 let msg = format!(
66 "DecodeReaderBytesBuilder: buffer of size {} is too small",
67 buffer.as_mut().len(),
68 );
69 return Err(io::Error::new(io::ErrorKind::Other, msg));
70 }
71 let encoding =
72 self.encoding.map(|enc| enc.new_decoder_with_bom_removal());
73
74
75
76 let has_detected =
77 !self.bom_sniffing || (!self.bom_override && encoding.is_some());
78
79 let peeker = if self.strip_bom {
80 BomPeeker::without_bom(rdr)
81 } else {
82 BomPeeker::with_bom(rdr)
83 };
84 Ok(DecodeReaderBytes {
85 rdr: peeker,
86 decoder: encoding,
87 tiny: TinyTranscoder::new(),
88 utf8_passthru: self.utf8_passthru,
89 buf: buffer,
90 buflen: 0,
91 pos: 0,
92 has_detected: has_detected,
93 exhausted: false,
94 })
95 }
96
97 /// Set an explicit encoding to be used by this decoder.
98 ///
99 /// When an explicit encoding is set, BOM sniffing is disabled and the
100 /// encoding provided will be used unconditionally. Errors in the encoded
101 /// bytes are replaced by the Unicode replacement codepoint.
102 ///
103 /// By default, no explicit encoding is set.
104 pub fn encoding(
105 &mut self,
106 encoding: Option<&'static Encoding>,
107 ) -> &mut DecodeReaderBytesBuilder {
108 self.encoding = encoding;
109 self
110 }
111
112
113 pub fn utf8_passthru(
114 &mut self,
115 yes: bool,
116 ) -> &mut DecodeReaderBytesBuilder {
117 self.utf8_passthru = yes;
118 self
119 }
120
121
122 pub fn strip_bom(&mut self, yes: bool) -> &mut DecodeReaderBytesBuilder {
123 self.strip_bom = yes;
124 self
125 }
126
127 /// Give the highest precedent to the BOM, if one is found.
128 ///
129 /// When this is enabled, and if a BOM is found, then the encoding
130 /// indicated by that BOM is used even if an explicit encoding has been
131 /// set via the `encoding` method.
132 ///
133 /// This does not override `utf8_passthru`.
134 ///
135 /// This is disabled by default.
136 pub fn bom_override(
137 &mut self,
138 yes: bool,
139 ) -> &mut DecodeReaderBytesBuilder {
140 self.bom_override = yes;
141 self
142 }
143
144 /// Enable BOM sniffing
145 ///
146 /// When this is enabled and an explicit encoding is not set, the decoder
147 /// will try to detect the encoding with BOM.
148 ///
149 /// When this is disabled and an explicit encoding is not set, the decoder
150 /// will treat the input as raw bytes. The bytes will be passed through
151 /// unchanged, including any BOM that may be present.
152 ///
153 /// This is enabled by default.
154 pub fn bom_sniffing(
155 &mut self,
156 yes: bool,
157 ) -> &mut DecodeReaderBytesBuilder {
158 self.bom_sniffing = yes;
159 self
160 }
161}
162
163/// An implementation of `io::Read` that transcodes to UTF-8 in a streaming
164/// fashion.
165///
166/// The high level goal of this decoder is to provide access to byte streams
167/// that are assumed to be UTF-8 unless an encoding is otherwise specified
168/// (either via a BOM or via an explicit designation of an encoding).
169///
170/// When no explicit source encoding is specified (via
171/// `DecodeReaderBytesBuilder`), the source encoding is determined by
172/// inspecting the BOM from the stream read from `R`, if one exists. If a
173/// UTF-16 BOM exists, then the source stream is transcoded to UTF-8 with
174/// invalid UTF-16 sequences translated to the Unicode replacement character.
175/// Similarly if a UTF-8 BOM is seen. In all other cases, the source of the
176/// underlying reader is passed through unchanged _as if_ it were UTF-8.
177///
178/// Since this particular reader does not guarantee providing valid UTF-8 to
179/// the caller, the caller must be prepared to handle invalid UTF-8 itself.
180///
181/// `R` is the type of the underlying reader and `B` is the type of an internal
182/// buffer used to store the results of transcoding. Callers may elect to reuse
183/// the internal buffer via the `DecodeReaderBytesBuilder::build_with_buffer`
184/// constructor.
185pub struct DecodeReaderBytes<R, B> {
186 /// The underlying reader, wrapped in a peeker for reading a BOM if one
187 /// exists.
188 rdr: BomPeeker<R>,
189 /// The underlying text decoder derived from the BOM or an explicitly
190 /// specified encoding, if one exists.
191 decoder: Option<Decoder>,
192 /// A "tiny transcoder" for use when a caller provides a buffer that is
193 /// too small to write at least one UTF-8 encoded codepoint to.
194 tiny: TinyTranscoder,
195 /// When enabled, if a UTF-8 BOM is observed, then the bytes are passed
196 /// through from the underlying reader as-is instead of passing through
197 /// the UTF-8 transcoder (which will replace invalid sequences with the
198 /// REPLACEMENT CHARACTER).
199 utf8_passthru: bool,
200 /// The internal buffer to store transcoded bytes before they are read by
201 /// callers.
202 buf: B,
203 /// The current position in `buf`. Subsequent reads start here.
204 pos: usize,
205 /// The number of transcoded bytes in `buf`. Subsequent reads end here.
206 buflen: usize,
207 /// Whether BOM detection has been performed yet or not.
208 has_detected: bool,
209 /// Whether the underlying reader has been exhausted or not.
210 exhausted: bool,
211}
212
213impl<R: Read, B: AsMut<[u8]>> Read for DecodeReaderBytes<R, B> {
214 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
215 self.detect()?;
216 if self.decoder.is_none() {
217 self.rdr.read(buf)
218 } else {
219 self.transcode(buf)
220 }
221 }
222}
223
224impl<R: Read> DecodeReaderBytes<R, Vec<u8>> {
225
226 pub fn new(rdr: R) -> DecodeReaderBytes<R, Vec<u8>> {
227 DecodeReaderBytesBuilder::new().build(rdr)
228 }
229}
230impl<R: Read, B: AsMut<[u8]>> DecodeReaderBytes<R, B> {
231
232 fn transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
233 if self.exhausted || buf.is_empty() {
234 return Ok(0);
235 }
236 let nwrite = self.tiny.read(buf)?;
237 if nwrite > 0 {
238 // We could technically mush on if the caller provided buffer is
239 // big enough, but to keep things we simple, we satisfy the
240 // contract and quit.
241 return Ok(nwrite);
242 }
243 if self.pos >= self.buflen {
244 self.fill()?;
245 }
246 if buf.len() < 4 {
247 return self.tiny_transcode(buf);
248 }
249 loop {
250 let (_, nin, nout, _) =
251 self.decoder.as_mut().unwrap().decode_to_utf8(
252 &self.buf.as_mut()[self.pos..self.buflen],
253 buf,
254 false,
255 );
256 self.pos += nin;
257 // If we've written at least one byte to the caller-provided
258 // buffer, then our mission is complete.
259 if nout > 0 {
260 return Ok(nout);
261 }
262 // Otherwise, we know that our internal buffer has insufficient
263 // data to transcode at least one char, so we attempt to refill it.
264 self.fill()?;
265 // ... but quit on EOF.
266 if self.buflen == 0 {
267 let (_, _, nout, _) = self
268 .decoder
269 .as_mut()
270 .unwrap()
271 .decode_to_utf8(&[], buf, true);
272 return Ok(nout);
273 }
274 }
275 }
276
277 /// Like transcode, but deals with the case where the caller provided
278 /// buffer is less than 4.
279 fn tiny_transcode(&mut self, buf: &mut [u8]) -> io::Result<usize> {
280 assert!(buf.len() < 4, "have a small caller buffer");
281 loop {
282 let (nin, nout) = self.tiny.transcode(
283 self.decoder.as_mut().unwrap(),
284 &self.buf.as_mut()[self.pos..self.buflen],
285 false,
286 );
287 self.pos += nin;
288 if nout > 0 {
289 // We've satisfied the contract of writing at least one byte,
290 // so we're done. The tiny transcoder is guaranteed to yield
291 // a non-zero number of bytes.
292 return self.tiny.read(buf);
293 }
294 // Otherwise, we know that our internal buffer has insufficient
295 // data to transcode at least one char, so we attempt to refill it.
296 self.fill()?;
297 // ... but quit on EOF.
298 if self.buflen == 0 {
299 self.tiny.transcode(self.decoder.as_mut().unwrap(), &[], true);
300 return self.tiny.read(buf);
301 }
302 }
303 }
304
305 /// Peeks at the underlying reader to look for a BOM. If one exists, then
306 /// an appropriate decoder is created corresponding to the detected BOM.
307 fn detect(&mut self) -> io::Result<()> {
308 if self.has_detected {
309 return Ok(());
310 }
311 self.has_detected = true;
312 let bom = self.rdr.peek_bom()?;
313 if let Some(encoding) = bom.encoding() {
314 // If we got a UTF-8 BOM, and the decoder was configured for
315 // passing through UTF-8, then don't build a decoder at all.
316 if encoding == UTF_8 && self.utf8_passthru {
317 return Ok(());
318 }
319 self.decoder = Some(encoding.new_decoder_with_bom_removal());
320 }
321 Ok(())
322 }
323
324 /// Fill the internal buffer from the underlying reader.
325 ///
326 /// If there are unread bytes in the internal buffer, then we move them
327 /// to the beginning of the internal buffer and fill the remainder.
328 ///
329 /// If the internal buffer is too small to read additional bytes, then an
330 /// error is returned.
331 fn fill(&mut self) -> io::Result<()> {
332 if self.pos < self.buflen {
333 // Despite my best efforts, I could not seem to actually exercise
334 // this code path in tests. Namely, this code path occurs when the
335 // decoder can't make any progress and also doesn't consume all of
336 // the input. Since I'm not sure how to trigger that case, this
337 // code path is actually untested!
338
339 // We can assert this because we require that the caller provided
340 // buffer be at least 4 bytes big.
341 assert!(
342 self.buflen < self.buf.as_mut().len(),
343 "internal buffer should never be exhausted"
344 );
345 let buf = self.buf.as_mut();
346 for (dst, src) in (self.pos..self.buflen).enumerate() {
347 buf[dst] = buf[src];
348 }
349 self.buflen -= self.pos;
350 } else {
351 self.buflen = 0;
352 }
353 self.pos = 0;
354 self.buflen += self.rdr.read(&mut self.buf.as_mut()[self.buflen..])?;
355 if self.buflen == 0 {
356 self.exhausted = true;
357 }
358 Ok(())
359 }
360}
361
362impl<R: fmt::Debug, B: fmt::Debug> fmt::Debug for DecodeReaderBytes<R, B> {
363 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
364 let mut fmter = f.debug_struct("DecodeReaderBytes");
365 fmter
366 .field("rdr", &self.rdr)
367 .field("tiny", &self.tiny)
368 .field("utf8_passthru", &self.utf8_passthru)
369 .field("buf", &self.buf)
370 .field("pos", &self.pos)
371 .field("buflen", &self.buflen)
372 .field("has_detected", &self.has_detected)
373 .field("exhausted", &self.exhausted);
374 // Because `encoding_rs::Decoder` doesn't impl `fmt::Debug`.
375 if let Some(ref d) = self.decoder {
376 let msg = format!("Some(<Decoder for {}>)", d.encoding().name());
377 fmter.field("decoder", &msg);
378 } else {
379 fmter.field("decoder", &"None");
380 }
381 fmter.finish()
382 }
383}
/// Write the bytes of `dest` to `filepath`, creating the file or truncating
/// an existing one.
///
/// Despite the name, this function performs no BOM handling or conversion
/// itself; it only persists the already-decoded string.
///
/// # Panics
///
/// Panics if the file cannot be created, written, or flushed.
fn remove_bom_and_convert_to_utf8<P: AsRef<Path>>(dest: &mut String, filepath: P) {
    File::create(filepath)
        .and_then(|mut out| {
            out.write_all(dest.as_bytes())?;
            out.flush()
        })
        .unwrap();
}
390
391fn detect_gbk_encoding_and_transform_utf8<P: AsRef<Path> + Copy>(file_path:P ) -> io::Result<String> {
392 let mut file = File::open(file_path)?;
393 let mut buffer = Vec::new();
394 file.read_to_end(&mut buffer)?;
395
396 // 尝试使用 GBK 解码
397 let (encoding, _, had_errors) = GBK.decode(&buffer);
398 if had_errors {
399 eprintln!("Warning: Some errors occurred during decoding.");
400 }
401 if !encoding.is_empty() {
402 let utf8_content = encoding.to_string();
403
404 let result = utf8_content;
405 // 写入文件
406 let mut output_file = File::create(file_path)?;
407 output_file.write_all(result.as_bytes())?;
408
409 return Ok(result.to_string());
410 }
411
412 // 如果 GBK 检测失败,继续检测 GBK2312
413 let (encoding, _, _) = WINDOWS_1252.decode(&buffer);
414 if !encoding.is_empty() {
415 let utf8_content = encoding.to_string();
416 let mut output_file = File::create(file_path)?;
417 output_file.write_all(utf8_content.as_bytes())?;
418
419 let result = utf8_content;
420 return Ok(result.to_string());
421 }
422
423 Ok("Unknown encoding".to_string())
424}
425
426pub fn translate_all_encoded_mode_file_to_utf8<P: AsRef<Path> + Copy>(file_path :P ) -> Result<(), Box<dyn Error>> {
427 let file = std::fs::File::open(&file_path)?;
428 let source_data = std::io::BufReader::new(file);
429 let mut decoder = DecodeReaderBytes::new(source_data);
430
431 let mut dest = String::new();
432 let result = decoder.read_to_string(&mut dest);
433 if result.is_err() {
434 eprintln!("Error decoding file");
435 dest = detect_gbk_encoding_and_transform_utf8(file_path)?;
436 } else {
437 remove_bom_and_convert_to_utf8(&mut dest, file_path);
438 }
439
440 Ok(())
441
442}
443