// rc_zip/encoding.rs

//! Character encodings used in ZIP files.
//!
//! ZIP entry paths may be encoded in a variety of character encodings:
//! historically, CP-437 was used, but many modern zip files use UTF-8 with an
//! optional UTF-8 flag.
//!
//! Others use the system's local character encoding, and we have no choice but
//! to make an educated guess thanks to the chardet-ng crate.
9
use std::fmt;
11
/// Character encodings this crate can decode ZIP text from.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum Encoding {
    /// [UTF-8](https://en.wikipedia.org/wiki/UTF-8), opt-in for ZIP files.
    Utf8,

    /// [Codepage 437](https://en.wikipedia.org/wiki/Code_page_437), also known as
    /// OEM-US, PC-8, or DOS Latin US.
    ///
    /// This is the fallback if UTF-8 is not specified and no other encoding
    /// is auto-detected. It was the original encoding of the zip format.
    Cp437,

    /// [Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS), also known as SJIS.
    ///
    /// Still in use by some Japanese users as of 2019.
    ShiftJis,
}
30
31impl fmt::Display for Encoding {
32    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
33        use Encoding as T;
34        match self {
35            T::Utf8 => write!(f, "utf-8"),
36            T::Cp437 => write!(f, "cp-437"),
37            T::ShiftJis => write!(f, "shift-jis"),
38        }
39    }
40}
41
/// Errors encountered while converting text to UTF-8.
#[derive(Debug)]
pub enum DecodingError {
    /// Text claimed to be UTF-8, but wasn't (as far as we can tell).
    Utf8Error(std::str::Utf8Error),

    /// Text is too large to be converted.
    ///
    /// In practice, this happens if the text's length is larger than
    /// [`usize::MAX`], which seems unlikely.
    StringTooLarge,

    /// Text is not valid in the given encoding.
    ///
    /// The payload is the name of the encoding that rejected the text.
    EncodingError(&'static str),
}
57
58impl From<std::str::Utf8Error> for DecodingError {
59    fn from(e: std::str::Utf8Error) -> Self {
60        DecodingError::Utf8Error(e)
61    }
62}
63
64impl fmt::Display for DecodingError {
65    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
66        match self {
67            Self::Utf8Error(utf8) => write!(f, "invalid utf-8: {utf8}"),
68            Self::StringTooLarge => f.write_str("text too large to be converted"),
69            Self::EncodingError(enc) => write!(f, "encoding error: {enc}"),
70        }
71    }
72}
73
74impl std::error::Error for DecodingError {}
75
76impl Encoding {
77    pub(crate) fn decode(&self, i: &[u8]) -> Result<String, DecodingError> {
78        match self {
79            Encoding::Utf8 => {
80                let s = std::str::from_utf8(i)?;
81                Ok(s.to_string())
82            }
83            Encoding::Cp437 => Ok(oem_cp::decode_string_complete_table(
84                i,
85                &oem_cp::code_table::DECODING_TABLE_CP437,
86            )),
87            Encoding::ShiftJis => self.decode_as(i, encoding_rs::SHIFT_JIS),
88        }
89    }
90
91    fn decode_as(
92        &self,
93        i: &[u8],
94        encoding: &'static encoding_rs::Encoding,
95    ) -> Result<String, DecodingError> {
96        let mut decoder = encoding.new_decoder();
97        let len = decoder
98            .max_utf8_buffer_length(i.len())
99            .ok_or(DecodingError::StringTooLarge)?;
100        let mut v = vec![0u8; len];
101        let last = true;
102        let (_decoder_result, _decoder_read, decoder_written, had_errors) =
103            decoder.decode_to_utf8(i, &mut v, last);
104        if had_errors {
105            return Err(DecodingError::EncodingError(encoding.name()));
106        }
107        v.resize(decoder_written, 0u8);
108        Ok(unsafe { String::from_utf8_unchecked(v) })
109    }
110}
111
/// Reports whether `input` is a valid UTF-8 string, and whether it must be
/// considered UTF-8 encoded (i.e., not compatible with CP-437, ASCII, or any
/// other common encoding).
///
/// Returns `(valid, require)`:
///
/// - `valid` is `true` when `input` is well-formed UTF-8;
/// - `require` is `true` when `input` is valid UTF-8 and contains at least one
///   character outside `0x20..=0x7d`, or a backslash (`0x5c`). It is always
///   `false` when `valid` is `false`.
pub(crate) fn detect_utf8(input: &[u8]) -> (bool, bool) {
    match std::str::from_utf8(input) {
        // Not valid UTF-8.
        Err(_) => (false, false),
        Ok(s) => {
            // Officially, ZIP uses CP-437, but many readers use the system's
            // local character encoding. Most encodings are compatible with a
            // large subset of CP-437, which itself is ASCII-like.
            //
            // Forbid 0x7e and 0x5c since EUC-KR and Shift-JIS replace those
            // characters with localized currency and overline characters.
            let require = s
                .chars()
                .any(|c| !('\x20'..='\x7d').contains(&c) || c == '\x5c');
            (true, require)
        }
    }
}