codepage_strings/lib.rs
1#![doc(html_root_url = "https://docs.rs/codepage-strings/1.0.2")]
2
3/*!
4This Rust crate builds on the excellent work of the
5[`encoding_rs`], [`codepage`], and [`oem-cp`] crates in an attempt
6to provide idiomatic encoding and decoding of strings coded
7according to
8[Windows code pages](https://en.wikipedia.org/wiki/Windows_code_page).
9
10Because Windows code pages are a legacy rathole, it is
11difficult to transcode strings using them. Sadly, there are
12still a lot of files out there that use these encodings.
13This crate was specifically created for use with
14[RIFF](https://www.aelius.com/njh/wavemetatools/doc/riffmci.pdf),
15a file format that has code pages baked in for text
16internationalization.
17
18No effort has been made to deal with Windows code pages
19beyond those supported by [`codepage`] and [`oem-cp`]. If the
20single-byte codepage you need is missing, I suggest taking a
21look at adding it to [`oem-cp`], which seems to be the main
22Rust repository for unusual Windows code page tables. I
23believe that most of the single-byte code pages supported by
24`iconv` are dealt with here, but I haven't checked
25carefully.
26
Other than UTF-16LE and UTF-16BE, multibyte Windows code
pages are not currently supported — in particular
various Asian languages. Code page 65001 (UTF-8) is
supported as an identity transformation. UTF-32LE and
UTF-32BE are not supported. EBCDIC code pages and UTF-7 are
not supported and are low priority, because seriously?
33
34No particular effort has been put into performance. The
35interface allows [`std::borrow::Cow`] to some extent, but this
36is limited by the minor impedance mismatches between
37[`encoding_rs`] and [`oem-cp`].
38
39# Examples
40
41Do some string conversions on Windows code page 869
42(alternate Greek).
43
44```rust
45# use codepage_strings::*;
46# fn main() -> Result<(), Box<dyn std::error::Error>> {
47let coding = Coding::new(869)?;
48assert_eq!(
49 coding.encode("αβ")?,
50 vec![214, 215],
51);
52assert_eq!(
53 coding.decode(&[214, 215])?,
54 "αβ",
55);
56assert_eq!(
57 coding.decode_lossy(&[214, 147]),
58 "α\u{fffd}",
59);
60assert_eq!(
61 coding.decode(&[214, 147]),
62 Err(ConvertError::StringDecoding),
63);
64# Ok(())
65# }
66```
67
68[`encoding_rs`]: http://crates.io/crates/encoding_rs
69[`codepage`]: http://crates.io/crates/codepage
70[`oem-cp`]: http://crates.io/crates/oem-cp
71[`std::borrow::Cow`]: https://doc.rust-lang.org/nightly/alloc/borrow/enum.Cow.html
72*/
73
74use std::borrow::Cow;
75
/// Failure modes reported by the conversion routines in this crate.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ConvertError {
    /// The string could not be encoded as requested.
    StringEncoding,
    /// The bytes could not be decoded as requested.
    StringDecoding,
    /// The requested Windows code page is not one the library understands.
    UnknownCodepage,
    /// The requested Windows code page is one the library cannot handle.
    UnsupportedCodepage,
}

impl std::fmt::Display for ConvertError {
    /// Write the fixed, human-readable message for this error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(match *self {
            ConvertError::StringEncoding => "string codepage encoding error",
            ConvertError::StringDecoding => "string decoding error",
            ConvertError::UnknownCodepage => "invalid / unknown Windows code page",
            ConvertError::UnsupportedCodepage => "cannot transcode this Windows code page",
        })
    }
}

impl std::error::Error for ConvertError {}
103
/// Byte order of a UTF-16 code unit.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Endian {
    /// Little-endian: least significant byte first.
    Le,
    /// Big-endian: most significant byte first.
    Be,
}
109
110#[derive(Debug, Clone)]
111enum Codings {
112 Ers(&'static encoding_rs::Encoding),
113 OemCp {
114 encode: &'static oem_cp::OEMCPHashMap<char, u8>,
115 decode: &'static oem_cp::code_table_type::TableType,
116 },
117 Identity,
118 UTF16(Endian),
119}
120
121/// Coding information derived from a Windows code page.
122#[derive(Debug, Clone)]
123pub struct Coding(Codings);
124
125impl Coding {
126 /// Get an encoding for the given code page.
127 ///
128 /// # Errors
129 ///
130 /// Will fail with [`ConvertError::UnknownCodepage`] or
131 /// [`ConvertError::UnsupportedCodepage`] if an encoding
132 /// for the given page is unavailable.
133 pub fn new(cp: u16) -> Result<Self, ConvertError> {
134 if cp == 65001 {
135 // UTF-8
136 return Ok(Coding(Codings::Identity));
137 }
138 if cp == 1200 {
139 // UTF-16LE
140 return Ok(Coding(Codings::UTF16(Endian::Le)));
141 }
142 if cp == 1201 {
143 // UTF-16BE
144 return Ok(Coding(Codings::UTF16(Endian::Be)));
145 }
146 if [12000, 12001, 65000].contains(&cp) {
147 // Weird UTF format (UTF-32LE, UTF-32BE, UTF-7).
148 return Err(ConvertError::UnsupportedCodepage);
149 }
150 if let Some(c) = codepage::to_encoding(cp) {
151 return Ok(Coding(Codings::Ers(c)));
152 }
153 let encode = match (*oem_cp::code_table::ENCODING_TABLE_CP_MAP).get(&cp) {
154 Some(e) => e,
155 None => return Err(ConvertError::UnknownCodepage),
156 };
157 let decode = match (*oem_cp::code_table::DECODING_TABLE_CP_MAP).get(&cp) {
158 Some(e) => e,
159 None => return Err(ConvertError::UnknownCodepage),
160 };
161 Ok(Coding(Codings::OemCp { encode, decode }))
162 }
163
164 /// Encode a UTF-8 string into a byte vector according
165 /// to this encoding.
166 ///
167 /// # Errors
168 ///
169 /// Returns [`ConvertError::StringEncoding`] if any
170 /// character cannot be encoded.
171 pub fn encode<'a, S>(&self, src: S) -> Result<Vec<u8>, ConvertError>
172 where
173 S: Into<Cow<'a, str>>,
174 {
175 match self.0 {
176 Codings::Ers(c) => {
177 let src = src.into();
178 let oe = c.output_encoding();
179 let (out, _, fail) = oe.encode(src.as_ref());
180 if fail {
181 Err(ConvertError::StringEncoding)
182 } else {
183 Ok(out.to_owned().to_vec())
184 }
185 }
186 Codings::OemCp { encode: et, .. } => match oem_cp::encode_string_checked(src, et) {
187 Some(out) => Ok(out),
188 None => Err(ConvertError::StringEncoding),
189 },
190 Codings::Identity => Ok(src.into().as_ref().as_bytes().to_vec()),
191 Codings::UTF16(e) => {
192 let encoded = src
193 .into()
194 .as_ref()
195 .encode_utf16()
196 .flat_map(|w| {
197 let lo = (w & 0xff) as u8;
198 let hi = (w >> 8) as u8;
199 let bs: Vec<u8> = match e {
200 Endian::Le => vec![lo, hi],
201 Endian::Be => vec![hi, lo],
202 };
203 bs.into_iter()
204 })
205 .collect();
206 Ok(encoded)
207 }
208 }
209 }
210
211 /// Decode a byte vector into UTF-8 [`Cow`]`<`[`str`]`>` according
212 /// to this encoding.
213 ///
214 /// # Errors
215 ///
216 /// Returns [`ConvertError::StringDecoding`] if any
217 /// character cannot be decoded.
218 pub fn decode<'a>(&self, src: &'a [u8]) -> Result<Cow<'a, str>, ConvertError> {
219 match self.0 {
220 Codings::Ers(c) => {
221 let (out, _, fail) = c.decode(src.as_ref());
222 if fail {
223 Err(ConvertError::StringDecoding)
224 } else {
225 Ok(out)
226 }
227 }
228 Codings::OemCp { decode: dt, .. } => match dt.decode_string_checked(src) {
229 Some(s) => Ok(Cow::from(s)),
230 None => Err(ConvertError::StringDecoding),
231 },
232 Codings::Identity => match std::str::from_utf8(src) {
233 Ok(s) => Ok(Cow::from(s)),
234 Err(_) => Err(ConvertError::StringDecoding),
235 },
236 Codings::UTF16(e) => {
237 let ws = src
238 .chunks(2)
239 .map(|bs| {
240 if bs.len() < 2 {
241 return Err(ConvertError::StringDecoding);
242 }
243 let (hi, lo) = (bs[0] as u16, bs[1] as u16);
244 match e {
245 Endian::Le => Ok((lo << 8) | hi),
246 Endian::Be => Ok((hi << 8) | lo),
247 }
248 })
249 .collect::<Result<Vec<u16>, ConvertError>>()?;
250 match String::from_utf16(&ws) {
251 Ok(s) => Ok(Cow::from(s)),
252 Err(_) => Err(ConvertError::StringDecoding),
253 }
254 }
255 }
256 }
257
258 /// Decode a byte vector into UTF-8 [`Cow`]`<`[`str`]`>` according
259 /// to this encoding. Replace any bytes that cannot be
260 /// encoded with the Unicode
261 /// "[replacement character](https://en.wikipedia.org/wiki/Specials_%28Unicode_block%29#Replacement_character)"
262 /// (`\u{fffd}`).
263 pub fn decode_lossy<'a>(&self, src: &'a [u8]) -> Cow<'a, str> {
264 match self.0 {
265 Codings::Ers(c) => {
266 let (out, _, _) = c.decode(src.as_ref());
267 out
268 }
269 Codings::OemCp { decode: dt, .. } => Cow::from(dt.decode_string_lossy(src)),
270 Codings::Identity => match std::str::from_utf8(src) {
271 Ok(s) => Cow::from(s),
272 Err(_) => String::from_utf8_lossy(src),
273 },
274 Codings::UTF16(e) => {
275 let ws: Vec<u16> = src
276 .chunks(2)
277 .map(|bs| {
278 let (hi, lo) = if bs.len() == 1 {
279 // Unicode replacement character.
280 (0xff, 0xfd)
281 } else {
282 // Big-endian by default.
283 (bs[0] as u16, bs[1] as u16)
284 };
285 match e {
286 Endian::Le => (lo << 8) | hi,
287 Endian::Be => (hi << 8) | lo,
288 }
289 })
290 .collect();
291 Cow::from(String::from_utf16_lossy(&ws))
292 }
293 }
294 }
295}