local_encoding/
windows.rs

1//! 8-bit string converters for Windows systems.
2extern crate winapi;
3extern crate kernel32;
4
5use std::ptr;
6use std::io::{Error, ErrorKind, Result};
7use std::ffi::OsStr;
8use std::os::windows::ffi::OsStrExt;
9use self::winapi::{BOOL, DWORD};
10use super::Encoder;
11
// Conversion flags. The `MB_*` values are intended for the `flags` argument of
// `multi_byte_to_wide_char`, the `WC_*` values for the `flags` argument of
// `wide_char_to_multi_byte`; both wrappers pass them to the WinAPI unchanged.

/// Always use precomposed characters, that is, characters having a single character value for
/// a base or nonspacing character combination.
pub const MB_PRECOMPOSED: DWORD = 0x00000001;
/// Always use decomposed characters, that is, characters in which a base character and one or more
/// nonspacing characters each have distinct code point values.
pub const MB_COMPOSITE: DWORD = 0x00000002;
/// Use glyph characters instead of control characters.
pub const MB_USEGLYPHCHARS: DWORD = 0x00000004;
/// Fail if an invalid input character is encountered.
pub const MB_ERR_INVALID_CHARS: DWORD = 0x00000008;
/// Convert composite characters, consisting of a base character and a nonspacing character,
/// each with different character values.
pub const WC_COMPOSITECHECK: DWORD = 0x00000200;
/// Discard nonspacing characters during conversion.
pub const WC_DISCARDNS: DWORD = 0x00000010;
/// Default. Generate separate characters during conversion.
pub const WC_SEPCHARS: DWORD = 0x00000020;
/// Replace exceptions with the default character during conversion.
pub const WC_DEFAULTCHAR: DWORD = 0x00000040;
/// Fail if an invalid input character is encountered.
pub const WC_ERR_INVALID_CHARS: DWORD = 0x00000080;
/// Translate any Unicode characters that do not translate directly to multibyte equivalents to
/// the default character specified by lpDefaultChar.
pub const WC_NO_BEST_FIT_CHARS: DWORD = 0x00000400;
36
/// Encoder backed by the WinAPI calls MultiByteToWideChar and WideCharToMultiByte.
///
/// The wrapped `u32` is the code page identifier that the `Encoder` implementation passes
/// to those calls.
pub struct EncoderCodePage(pub u32);
39
40impl Encoder for EncoderCodePage {
41    ///     Convert from bytes to string.
42    fn to_string(self: &Self, data: &[u8]) -> Result<String> {
43        multi_byte_to_wide_char(self.0, MB_ERR_INVALID_CHARS, data)
44    }
45
46    /// Convert from string to bytes.
47    fn to_bytes(self: &Self, data: &str) -> Result<Vec<u8>> {
48        string_to_multibyte(self.0, data, None)
49    }
50}
51
52/// Convert String to 8-bit string.
53///
54/// * `codepage`     - Code page to use in performing the conversion. This parameter can be set to
55///                    the value of any code page that is installed or available in the operating
56///                    system.
57/// * `data`         - Source string.
58/// * `default_char` - Optional character for replace to use if a character cannot be represented
59///                    in the specified code page.
60///
61/// Returns `Err` if an invalid input character is encountered and `default_char` is `None`.
62pub fn string_to_multibyte(codepage: DWORD,
63                           data: &str,
64                           default_char: Option<u8>)
65                           -> Result<Vec<u8>> {
66    let wstr: Vec<u16> = OsStr::new(data).encode_wide().collect();
67    wide_char_to_multi_byte(codepage,
68                            WC_COMPOSITECHECK,
69                            &wstr,
70                            default_char,
71                            default_char.is_none())
72        .and_then(|(data, invalid)| if invalid {
73            Err(Error::new(ErrorKind::InvalidInput,
74                           "Can't convert some characters to multibyte charset"))
75        } else {
76            Ok(data)
77        })
78}
79
80/// Wrapper for MultiByteToWideChar.
81///
82/// See https://msdn.microsoft.com/en-us/library/windows/desktop/dd319072(v=vs.85).aspx
83/// for more details.
84pub fn multi_byte_to_wide_char(codepage: DWORD,
85                               flags: DWORD,
86                               multi_byte_str: &[u8])
87                               -> Result<String> {
88    // Empty string
89    if multi_byte_str.len() == 0 {
90        return Ok(String::new());
91    }
92    unsafe {
93        // Get length of UTF-16 string
94        let len = kernel32::MultiByteToWideChar(codepage,
95                                                flags,
96                                                multi_byte_str.as_ptr() as winapi::LPSTR,
97                                                multi_byte_str.len() as i32,
98                                                ptr::null_mut(),
99                                                0);
100        if len > 0 {
101            // Convert to UTF-16
102            let mut wstr: Vec<u16> = Vec::with_capacity(len as usize);
103            wstr.set_len(len as usize);
104            let len = kernel32::MultiByteToWideChar(codepage,
105                                                    flags,
106                                                    multi_byte_str.as_ptr() as winapi::LPSTR,
107                                                    multi_byte_str.len() as i32,
108                                                    wstr.as_mut_ptr(),
109                                                    len);
110            if len > 0 {
111                return String::from_utf16(&wstr[0..(len as usize)])
112                    .map_err(|e| Error::new(ErrorKind::InvalidInput, e));
113            }
114        }
115        Err(Error::last_os_error())
116    }
117
118}
119
120/// Wrapper for WideCharToMultiByte.
121///
122/// See https://msdn.microsoft.com/ru-ru/library/windows/desktop/dd374130(v=vs.85).aspx
123/// for more details.
124pub fn wide_char_to_multi_byte(codepage: DWORD,
125                               flags: DWORD,
126                               wide_char_str: &[u16],
127                               default_char: Option<u8>,
128                               use_default_char_flag: bool)
129                               -> Result<(Vec<u8>, bool)> {
130    // Empty string
131    if wide_char_str.len() == 0 {
132        return Ok((Vec::new(), false));
133    }
134    unsafe {
135        // Get length of multibyte string
136        let len = kernel32::WideCharToMultiByte(codepage,
137                                                flags,
138                                                wide_char_str.as_ptr(),
139                                                wide_char_str.len() as i32,
140                                                ptr::null_mut(),
141                                                0,
142                                                ptr::null(),
143                                                ptr::null_mut());
144
145        if len > 0 {
146            // Convert from UTF-16 to multibyte
147            let mut astr: Vec<u8> = Vec::with_capacity(len as usize);
148            astr.set_len(len as usize);
149            let default_char_ref: [i8; 1] = match default_char {
150                Some(c) => [c as i8],
151                None => [0],
152            };
153            let mut use_char_ref: [BOOL; 1] = [0];
154            let len = kernel32::WideCharToMultiByte(codepage,
155                                                    flags,
156                                                    wide_char_str.as_ptr(),
157                                                    wide_char_str.len() as i32,
158                                                    astr.as_mut_ptr() as winapi::LPSTR,
159                                                    len,
160                                                    match default_char {
161                                                        Some(_) => default_char_ref.as_ptr(),
162                                                        None => ptr::null(),
163                                                    },
164                                                    match use_default_char_flag {
165                                                        true => use_char_ref.as_mut_ptr(),
166                                                        false => ptr::null_mut(),
167                                                    });
168            if (len as usize) == astr.len() {
169                return Ok((astr, use_char_ref[0] != 0));
170            }
171            if len > 0 {
172                return Ok((astr[0..(len as usize)].to_vec(), use_char_ref[0] != 0));
173            }
174        }
175        Err(Error::last_os_error())
176    }
177}
178
#[test]
fn multi_byte_to_wide_char_empty() {
    // Empty input decodes to an empty string.
    let decoded = multi_byte_to_wide_char(winapi::CP_ACP, MB_ERR_INVALID_CHARS, b"").unwrap();
    assert_eq!(decoded, "");
}
184
#[test]
fn multi_byte_to_wide_char_ascii() {
    // Plain ASCII bytes decode to the same text in the ANSI code page.
    let decoded = multi_byte_to_wide_char(winapi::CP_ACP, MB_ERR_INVALID_CHARS, b"Test").unwrap();
    assert_eq!(decoded, "Test");
}
190
#[test]
fn multi_byte_to_wide_char_utf8() {
    // UTF-8 encoded Cyrillic input decodes to the matching string.
    let encoded = b"\xD0\xA2\xD0\xB5\xD1\x81\xD1\x82";
    let decoded = multi_byte_to_wide_char(winapi::CP_UTF8, MB_ERR_INVALID_CHARS, encoded).unwrap();
    assert_eq!(decoded, "Тест");
}
199
#[test]
fn multi_byte_to_wide_char_invalid() {
    // A trailing 0xC0 byte must be rejected when MB_ERR_INVALID_CHARS is requested.
    let result = multi_byte_to_wide_char(winapi::CP_UTF8, MB_ERR_INVALID_CHARS, b"Test\xC0");
    assert!(result.is_err());
}
204
#[test]
fn wide_char_to_multi_byte_empty() {
    // Empty input produces no bytes and no "default character used" report.
    let converted = wide_char_to_multi_byte(winapi::CP_UTF8, WC_ERR_INVALID_CHARS, &[], None, false)
        .unwrap();
    assert_eq!(converted, (b"".to_vec(), false));
}
211
#[test]
fn wide_char_to_multi_byte_ascii() {
    // "Test" as UTF-16 code units.
    let wide: [u16; 4] = [0x0054, 0x0065, 0x0073, 0x0074];
    let converted = wide_char_to_multi_byte(winapi::CP_ACP, WC_COMPOSITECHECK, &wide, None, true)
        .unwrap();
    assert_eq!(converted, (b"Test".to_vec(), false));
}
222
#[test]
fn wide_char_to_multi_byte_utf8() {
    // U+6F22 encodes to three bytes in UTF-8.
    let wide: [u16; 1] = [0x6F22];
    let converted = wide_char_to_multi_byte(winapi::CP_UTF8,
                                            WC_ERR_INVALID_CHARS,
                                            &wide,
                                            None,
                                            false)
        .unwrap();
    assert_eq!(converted, (b"\xE6\xBC\xA2".to_vec(), false));
}
233
#[test]
fn wide_char_to_multi_byte_replace() {
    // Use a fixed Western code page rather than CP_ACP: the outcome must not depend on the
    // machine's ANSI code page, in which U+6F22 may well be representable.
    let codepage = 1252;
    // "Test", U+6F22, ")" as UTF-16 code units.
    let wide: [u16; 6] = [0x0054, 0x0065, 0x0073, 0x0074, 0x6F22, 0x0029];
    let converted = wide_char_to_multi_byte(codepage,
                                            WC_DEFAULTCHAR | WC_COMPOSITECHECK,
                                            &wide,
                                            Some(b':'),
                                            true)
        .unwrap();
    // The unrepresentable character is replaced by ':' and the substitution is reported.
    assert_eq!(converted, (b"Test:)".to_vec(), true));
}
244
#[test]
fn wide_char_to_multi_byte_invalid() {
    // Use a fixed Western code page rather than CP_ACP: the outcome must not depend on the
    // machine's ANSI code page, in which U+6F22 may well be representable.
    let codepage = 1252;

    // An unrepresentable character is replaced and the substitution is reported.
    let replaced = wide_char_to_multi_byte(codepage,
                                           WC_COMPOSITECHECK,
                                           &[0x6F22],
                                           Some(b':'),
                                           true)
        .unwrap();
    assert_eq!(replaced, (b":".to_vec(), true));

    // A representable character passes through with no substitution reported.
    let untouched = wide_char_to_multi_byte(codepage,
                                            WC_COMPOSITECHECK,
                                            &[0x0020],
                                            Some(b':'),
                                            true)
        .unwrap();
    assert_eq!(untouched, (b" ".to_vec(), false));
}
262
#[cfg(test)]
mod tests {
    extern crate winapi;

    use super::*;
    use super::super::Encoder;

    /// Windows-1251 bytes decode to the Cyrillic word.
    #[test]
    fn cp1251_to_string_test() {
        let encoder = EncoderCodePage(1251);
        let decoded = encoder.to_string(b"\xD2\xE5\xF1\xF2").unwrap();
        assert_eq!(decoded, "Тест");
    }

    /// The Cyrillic word encodes to the expected Windows-1251 bytes.
    #[test]
    fn string_to_cp1251_test() {
        let encoder = EncoderCodePage(1251);
        let encoded = encoder.to_bytes("Тест").unwrap();
        assert_eq!(encoded, b"\xD2\xE5\xF1\xF2");
    }

    /// Code page 866 bytes decode to the Cyrillic word.
    #[test]
    fn cp866_to_string_test() {
        let encoder = EncoderCodePage(866);
        let decoded = encoder.to_string(b"\x92\xA5\xE1\xE2").unwrap();
        assert_eq!(decoded, "Тест");
    }

    /// The Cyrillic word encodes to the expected code page 866 bytes.
    #[test]
    fn string_to_cp866_test() {
        let encoder = EncoderCodePage(866);
        let encoded = encoder.to_bytes("Тест").unwrap();
        assert_eq!(encoded, b"\x92\xA5\xE1\xE2");
    }
}