codepage_strings/
lib.rs

1#![doc(html_root_url = "https://docs.rs/codepage-strings/1.0.2")]
2
3/*!
4This Rust crate builds on the excellent work of the
5[`encoding_rs`], [`codepage`], and [`oem-cp`] crates in an attempt
6to provide idiomatic encoding and decoding of strings coded
7according to
8[Windows code pages](https://en.wikipedia.org/wiki/Windows_code_page).
9
10Because Windows code pages are a legacy rathole, it is
11difficult to transcode strings using them. Sadly, there are
12still a lot of files out there that use these encodings.
13This crate was specifically created for use with
14[RIFF](https://www.aelius.com/njh/wavemetatools/doc/riffmci.pdf),
15a file format that has code pages baked in for text
16internationalization.
17
18No effort has been made to deal with Windows code pages
19beyond those supported by [`codepage`] and [`oem-cp`]. If the
20single-byte codepage you need is missing, I suggest taking a
21look at adding it to [`oem-cp`], which seems to be the main
22Rust repository for unusual Windows code page tables. I
23believe that most of the single-byte code pages supported by
24`iconv` are dealt with here, but I haven't checked
25carefully.
26
Other than UTF-16LE and UTF-16BE, multibyte Windows code
pages are not (for now) supported — in particular, those
used for various Asian languages. Code page 65001 (UTF-8) is
supported as an identity transformation. UTF-32LE and
UTF-32BE are not supported. EBCDIC code pages and UTF-7 are
not supported and are low priority, because seriously?
33
34No particular effort has been put into performance. The
35interface allows [`std::borrow::Cow`] to some extent, but this
36is limited by the minor impedance mismatches between
37[`encoding_rs`] and [`oem-cp`].
38
39# Examples
40
41Do some string conversions on Windows code page 869
42(alternate Greek).
43
44```rust
45# use codepage_strings::*;
46# fn main() -> Result<(), Box<dyn std::error::Error>> {
47let coding = Coding::new(869)?;
48assert_eq!(
49    coding.encode("αβ")?,
50    vec![214, 215],
51);
52assert_eq!(
53    coding.decode(&[214, 215])?,
54    "αβ",
55);
56assert_eq!(
57    coding.decode_lossy(&[214, 147]),
58    "α\u{fffd}",
59);
60assert_eq!(
61    coding.decode(&[214, 147]),
62    Err(ConvertError::StringDecoding),
63);
64# Ok(())
65# }
66```
67
68[`encoding_rs`]: http://crates.io/crates/encoding_rs
69[`codepage`]: http://crates.io/crates/codepage
70[`oem-cp`]: http://crates.io/crates/oem-cp
71[`std::borrow::Cow`]: https://doc.rust-lang.org/nightly/alloc/borrow/enum.Cow.html
72*/
73
74use std::borrow::Cow;
75
/// Errors that can result from various conversions.
///
/// The enum is `#[non_exhaustive]`, so matches written outside this
/// crate need a wildcard arm.
#[non_exhaustive]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConvertError {
    /// Could not encode string as requested.
    StringEncoding,
    /// Could not decode string as requested.
    StringDecoding,
    /// Requested a Windows code page the library doesn't understand:
    /// no encoding or table for that page number could be found.
    UnknownCodepage,
    /// Requested a Windows code page the library can't do. Reported
    /// for UTF-32LE (12000), UTF-32BE (12001) and UTF-7 (65000).
    UnsupportedCodepage,
}
89
90impl std::fmt::Display for ConvertError {
91    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
92        let msg = match self {
93            ConvertError::StringEncoding => "string codepage encoding error",
94            ConvertError::StringDecoding => "string decoding error",
95            ConvertError::UnknownCodepage => "invalid / unknown Windows code page",
96            ConvertError::UnsupportedCodepage => "cannot transcode this Windows code page",
97        };
98        write!(f, "{}", msg)
99    }
100}
101
// The variants carry no data and wrap no underlying error, so the
// default `Error` trait methods are used unchanged.
impl std::error::Error for ConvertError {}
103
/// Byte order of the 16-bit code units in a UTF-16 byte stream.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Endian {
    /// Little-endian: low byte first (code page 1200).
    Le,
    /// Big-endian: high byte first (code page 1201).
    Be,
}
109
/// The conversion back end selected for a code page.
#[derive(Debug, Clone)]
enum Codings {
    /// Conversion is delegated to an `encoding_rs` encoding.
    Ers(&'static encoding_rs::Encoding),
    /// Conversion uses a pair of `oem_cp` lookup tables.
    OemCp {
        /// Table consulted when encoding (`char` to byte).
        encode: &'static oem_cp::OEMCPHashMap<char, u8>,
        /// Table consulted when decoding (byte to `char`).
        decode: &'static oem_cp::code_table_type::TableType,
    },
    /// UTF-8 (code page 65001): bytes pass through unchanged, apart
    /// from UTF-8 validation when decoding.
    Identity,
    /// UTF-16 with the given byte order (code pages 1200 and 1201).
    UTF16(Endian),
}
120
/// Coding information derived from a Windows code page.
///
/// Obtain one with [`Coding::new`], then convert text with
/// [`Coding::encode`], [`Coding::decode`] or [`Coding::decode_lossy`].
#[derive(Debug, Clone)]
pub struct Coding(Codings);
124
125impl Coding {
126    /// Get an encoding for the given code page.
127    ///
128    /// # Errors
129    ///
130    /// Will fail with [`ConvertError::UnknownCodepage`] or
131    /// [`ConvertError::UnsupportedCodepage`] if an encoding
132    /// for the given page is unavailable.
133    pub fn new(cp: u16) -> Result<Self, ConvertError> {
134        if cp == 65001 {
135            // UTF-8
136            return Ok(Coding(Codings::Identity));
137        }
138        if cp == 1200 {
139            // UTF-16LE
140            return Ok(Coding(Codings::UTF16(Endian::Le)));
141        }
142        if cp == 1201 {
143            // UTF-16BE
144            return Ok(Coding(Codings::UTF16(Endian::Be)));
145        }
146        if [12000, 12001, 65000].contains(&cp) {
147            // Weird UTF format (UTF-32LE, UTF-32BE, UTF-7).
148            return Err(ConvertError::UnsupportedCodepage);
149        }
150        if let Some(c) = codepage::to_encoding(cp) {
151            return Ok(Coding(Codings::Ers(c)));
152        }
153        let encode = match (*oem_cp::code_table::ENCODING_TABLE_CP_MAP).get(&cp) {
154            Some(e) => e,
155            None => return Err(ConvertError::UnknownCodepage),
156        };
157        let decode = match (*oem_cp::code_table::DECODING_TABLE_CP_MAP).get(&cp) {
158            Some(e) => e,
159            None => return Err(ConvertError::UnknownCodepage),
160        };
161        Ok(Coding(Codings::OemCp { encode, decode }))
162    }
163
164    /// Encode a UTF-8 string into a byte vector according
165    /// to this encoding.
166    ///
167    /// # Errors
168    ///
169    /// Returns [`ConvertError::StringEncoding`] if any
170    /// character cannot be encoded.
171    pub fn encode<'a, S>(&self, src: S) -> Result<Vec<u8>, ConvertError>
172    where
173        S: Into<Cow<'a, str>>,
174    {
175        match self.0 {
176            Codings::Ers(c) => {
177                let src = src.into();
178                let oe = c.output_encoding();
179                let (out, _, fail) = oe.encode(src.as_ref());
180                if fail {
181                    Err(ConvertError::StringEncoding)
182                } else {
183                    Ok(out.to_owned().to_vec())
184                }
185            }
186            Codings::OemCp { encode: et, .. } => match oem_cp::encode_string_checked(src, et) {
187                Some(out) => Ok(out),
188                None => Err(ConvertError::StringEncoding),
189            },
190            Codings::Identity => Ok(src.into().as_ref().as_bytes().to_vec()),
191            Codings::UTF16(e) => {
192                let encoded = src
193                    .into()
194                    .as_ref()
195                    .encode_utf16()
196                    .flat_map(|w| {
197                        let lo = (w & 0xff) as u8;
198                        let hi = (w >> 8) as u8;
199                        let bs: Vec<u8> = match e {
200                            Endian::Le => vec![lo, hi],
201                            Endian::Be => vec![hi, lo],
202                        };
203                        bs.into_iter()
204                    })
205                    .collect();
206                Ok(encoded)
207            }
208        }
209    }
210
211    /// Decode a byte vector into UTF-8 [`Cow`]`<`[`str`]`>` according
212    /// to this encoding.
213    ///
214    /// # Errors
215    ///
216    /// Returns [`ConvertError::StringDecoding`] if any
217    /// character cannot be decoded.
218    pub fn decode<'a>(&self, src: &'a [u8]) -> Result<Cow<'a, str>, ConvertError> {
219        match self.0 {
220            Codings::Ers(c) => {
221                let (out, _, fail) = c.decode(src.as_ref());
222                if fail {
223                    Err(ConvertError::StringDecoding)
224                } else {
225                    Ok(out)
226                }
227            }
228            Codings::OemCp { decode: dt, .. } => match dt.decode_string_checked(src) {
229                Some(s) => Ok(Cow::from(s)),
230                None => Err(ConvertError::StringDecoding),
231            },
232            Codings::Identity => match std::str::from_utf8(src) {
233                Ok(s) => Ok(Cow::from(s)),
234                Err(_) => Err(ConvertError::StringDecoding),
235            },
236            Codings::UTF16(e) => {
237                let ws = src
238                    .chunks(2)
239                    .map(|bs| {
240                        if bs.len() < 2 {
241                            return Err(ConvertError::StringDecoding);
242                        }
243                        let (hi, lo) = (bs[0] as u16, bs[1] as u16);
244                        match e {
245                            Endian::Le => Ok((lo << 8) | hi),
246                            Endian::Be => Ok((hi << 8) | lo),
247                        }
248                    })
249                    .collect::<Result<Vec<u16>, ConvertError>>()?;
250                match String::from_utf16(&ws) {
251                    Ok(s) => Ok(Cow::from(s)),
252                    Err(_) => Err(ConvertError::StringDecoding),
253                }
254            }
255        }
256    }
257
258    /// Decode a byte vector into UTF-8 [`Cow`]`<`[`str`]`>` according
259    /// to this encoding. Replace any bytes that cannot be
260    /// encoded with the Unicode
261    /// "[replacement character](https://en.wikipedia.org/wiki/Specials_%28Unicode_block%29#Replacement_character)"
262    /// (`\u{fffd}`).
263    pub fn decode_lossy<'a>(&self, src: &'a [u8]) -> Cow<'a, str> {
264        match self.0 {
265            Codings::Ers(c) => {
266                let (out, _, _) = c.decode(src.as_ref());
267                out
268            }
269            Codings::OemCp { decode: dt, .. } => Cow::from(dt.decode_string_lossy(src)),
270            Codings::Identity => match std::str::from_utf8(src) {
271                Ok(s) => Cow::from(s),
272                Err(_) => String::from_utf8_lossy(src),
273            },
274            Codings::UTF16(e) => {
275                let ws: Vec<u16> = src
276                    .chunks(2)
277                    .map(|bs| {
278                        let (hi, lo) = if bs.len() == 1 {
279                            // Unicode replacement character.
280                            (0xff, 0xfd)
281                        } else {
282                            // Big-endian by default.
283                            (bs[0] as u16, bs[1] as u16)
284                        };
285                        match e {
286                            Endian::Le => (lo << 8) | hi,
287                            Endian::Be => (hi << 8) | lo,
288                        }
289                    })
290                    .collect();
291                Cow::from(String::from_utf16_lossy(&ws))
292            }
293        }
294    }
295}