oem_cp/
string.rs

1use alloc::borrow::Cow;
2use alloc::string::String;
3use alloc::vec::Vec;
4use core::convert::Into;
5
6use super::code_table_type::TableType;
7use super::OEMCPHashMap;
8
9use TableType::*;
10
11impl TableType {
12    /// Wrapper function for decoding bytes encoded in SBCSs
13    ///
14    /// This function returns `None` if any bytes bumps into undefined codepoints
15    ///
16    /// # Arguments
17    ///
18    /// * `src` - bytes encoded in SBCS
19    ///
20    /// # Examples
21    ///
22    /// ```
23    /// use oem_cp::code_table::{DECODING_TABLE_CP437, DECODING_TABLE_CP874};
24    /// use oem_cp::code_table_type::TableType;
25    /// use TableType::{Complete,Incomplete};
26    ///
27    /// assert_eq!(Complete(&DECODING_TABLE_CP437).decode_string_checked(vec![0xFB, 0xAC, 0x3D, 0xAB]), Some("√¼=½".to_string()));
28    /// // means shrimp in Thai (U+E49 => 0xE9)
29    /// assert_eq!(Incomplete(&DECODING_TABLE_CP874).decode_string_checked(vec![0xA1, 0xD8, 0xE9, 0xA7]), Some("กุ้ง".to_string()));
30    /// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows (strict mode)
31    /// assert_eq!(Incomplete(&DECODING_TABLE_CP874).decode_string_checked(vec![0x30, 0xDB]), None);
32    /// ```
33    pub fn decode_string_checked<'a, T: Into<Cow<'a, [u8]>>>(&self, src: T) -> Option<String> {
34        match self {
35            Complete(table_ref) => Some(decode_string_complete_table(src, table_ref)),
36            Incomplete(table_ref) => decode_string_incomplete_table_checked(src, table_ref),
37        }
38    }
39    /// Wrapper function for decoding bytes encoded in SBCSs
40    ///
41    /// Undefined codepoints are replaced with U+FFFD.
42    ///
43    /// # Arguments
44    ///
45    /// * `src` - bytes encoded in SBCS
46    ///
47    /// # Examples
48    ///
49    /// ```
50    /// use oem_cp::code_table::{DECODING_TABLE_CP437, DECODING_TABLE_CP874};
51    /// use oem_cp::code_table_type::TableType;
52    /// use TableType::{Complete,Incomplete};
53    ///
54    /// assert_eq!(Complete(&DECODING_TABLE_CP437).decode_string_lossy(vec![0xFB, 0xAC, 0x3D, 0xAB]), "√¼=½".to_string());
55    /// // means shrimp in Thai (U+E49 => 0xE9)
56    /// assert_eq!(Incomplete(&DECODING_TABLE_CP874).decode_string_lossy(vec![0xA1, 0xD8, 0xE9, 0xA7]), "กุ้ง".to_string());
57    /// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows (strict mode)
58    /// assert_eq!(Incomplete(&DECODING_TABLE_CP874).decode_string_lossy(vec![0x30, 0xDB]), "0\u{FFFD}".to_string());
59    /// ```
60    pub fn decode_string_lossy<'a, T: Into<Cow<'a, [u8]>>>(&self, src: T) -> String {
61        match self {
62            Complete(table_ref) => decode_string_complete_table(src, table_ref),
63            Incomplete(table_ref) => decode_string_incomplete_table_lossy(src, table_ref),
64        }
65    }
66}
67
/// Decode SBCS (single byte character set) bytes using a table with no undefined codepoints.
///
/// Bytes below `0x80` are taken as ASCII unchanged; bytes from `0x80` up are
/// looked up in `decoding_table` at index `byte - 0x80`.
///
/// # Arguments
///
/// * `src` - bytes encoded in SBCS
/// * `decoding_table` - table for decoding SBCS (with**out** undefined codepoints)
///
/// # Examples
///
/// ```
/// use oem_cp::decode_string_complete_table;
/// use oem_cp::code_table::DECODING_TABLE_CP437;
///
/// assert_eq!(&decode_string_complete_table(vec![0xFB, 0xAC, 0x3D, 0xAB], &DECODING_TABLE_CP437), "√¼=½");
/// ```
pub fn decode_string_complete_table<'a, T: Into<Cow<'a, [u8]>>>(
    src: T,
    decoding_table: &[char; 128],
) -> String {
    let bytes: Cow<'a, [u8]> = src.into();
    let mut decoded = String::with_capacity(bytes.len());
    for &byte in bytes.iter() {
        decoded.push(match byte {
            0x00..=0x7F => char::from(byte),
            // High half: clearing the top bit gives the table index.
            _ => decoding_table[usize::from(byte & 0x7F)],
        });
    }
    decoded
}
98
/// Decode SBCS (single byte character set) bytes using a table that may contain
/// undefined codepoints, failing on the first undefined one.
///
/// Bytes below `0x80` are taken as ASCII unchanged; bytes from `0x80` up are
/// looked up in `decoding_table`. If any looked-up entry is `None`, the whole
/// result is `None`.
///
/// # Arguments
///
/// * `src` - bytes encoded in SBCS
/// * `decoding_table` - table for decoding SBCS (**with** undefined codepoints)
///
/// # Examples
///
/// ```
/// use oem_cp::decode_string_incomplete_table_checked;
/// use oem_cp::code_table::DECODING_TABLE_CP874;
///
/// // means shrimp in Thai (U+E49 => 0xE9)
/// assert_eq!(decode_string_incomplete_table_checked(vec![0xA1, 0xD8, 0xE9, 0xA7], &DECODING_TABLE_CP874), Some("กุ้ง".to_string()));
/// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows
/// assert_eq!(decode_string_incomplete_table_checked(vec![0x30, 0xDB], &DECODING_TABLE_CP874), None);
/// ```
pub fn decode_string_incomplete_table_checked<'a, T: Into<Cow<'a, [u8]>>>(
    src: T,
    decoding_table: &[Option<char>; 128],
) -> Option<String> {
    // Collecting `Option<char>` items into `Option<String>` stops at the first `None`.
    src.into()
        .iter()
        .map(|&byte| {
            if byte.is_ascii() {
                Some(char::from(byte))
            } else {
                decoding_table[usize::from(byte & 0x7F)]
            }
        })
        .collect()
}
133
/// Decode SBCS (single byte character set) bytes using a table that may contain
/// undefined codepoints, substituting a placeholder for undefined ones.
///
/// Bytes below `0x80` are taken as ASCII unchanged; bytes from `0x80` up are
/// looked up in `decoding_table`, and a `None` entry becomes `U+FFFD`
/// (replacement character).
///
/// # Arguments
///
/// * `src` - bytes encoded in SBCS
/// * `decoding_table` - table for decoding SBCS (**with** undefined codepoints)
///
/// # Examples
///
/// ```
/// use oem_cp::decode_string_incomplete_table_lossy;
/// use oem_cp::code_table::DECODING_TABLE_CP874;
///
/// // means shrimp in Thai (U+E49 => 0xE9)
/// assert_eq!(&decode_string_incomplete_table_lossy(vec![0xA1, 0xD8, 0xE9, 0xA7], &DECODING_TABLE_CP874), "กุ้ง");
/// // 0xDB-0xDE,0xFC-0xFF is invalid in CP874 in Windows
/// assert_eq!(&decode_string_incomplete_table_lossy(vec![0x30, 0xDB], &DECODING_TABLE_CP874), "0\u{FFFD}");
/// ```
pub fn decode_string_incomplete_table_lossy<'a, T: Into<Cow<'a, [u8]>>>(
    src: T,
    decoding_table: &[Option<char>; 128],
) -> String {
    let bytes: Cow<'a, [u8]> = src.into();
    let mut decoded = String::with_capacity(bytes.len());
    for &byte in bytes.iter() {
        let ch = if byte.is_ascii() {
            char::from(byte)
        } else {
            match decoding_table[usize::from(byte & 0x7F)] {
                Some(defined) => defined,
                None => '\u{FFFD}',
            }
        };
        decoded.push(ch);
    }
    decoded
}
169
170/// Encode Unicode string in SBCS (single byte character set)
171///
172/// If some undefined codepoints are found, returns `None`.
173///
174/// # Arguments
175///
176/// * `src` - Unicode string
177/// * `encoding_table` - table for encoding in SBCS
178///
179/// # Examples
180///
181/// ```
182/// use oem_cp::encode_string_checked;
183/// use oem_cp::code_table::{ENCODING_TABLE_CP437, ENCODING_TABLE_CP737};
184/// assert_eq!(encode_string_checked("π≈22/7", &ENCODING_TABLE_CP437), Some(vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]));
185/// // Archimedes in Greek
186/// assert_eq!(encode_string_checked("Αρχιμήδης", &ENCODING_TABLE_CP737), Some(vec![0x80, 0xA8, 0xAE, 0xA0, 0xA3, 0xE3, 0x9B, 0x9E, 0xAA]));
187/// // Japanese characters are not defined in CP437
188/// assert_eq!(encode_string_checked("日本語ja_jp", &ENCODING_TABLE_CP437), None);
189/// ```
190pub fn encode_string_checked<'a, T: Into<Cow<'a, str>>>(
191    src: T,
192    encoding_table: &OEMCPHashMap<char, u8>,
193) -> Option<Vec<u8>> {
194    let mut ret = Vec::new();
195    for c in src.into().chars() {
196        ret.push(if (c as u32) < 128 {
197            c as u8
198        } else {
199            *encoding_table.get(&c)?
200        });
201    }
202    Some(ret)
203}
204
205/// Encode Unicode string in SBCS (single byte character set)
206///
207/// Undefined codepoints are replaced with `0x3F` (`?`).
208///
209/// # Arguments
210///
211/// * `src` - Unicode string
212/// * `encoding_table` - table for encoding in SBCS
213///
214/// # Examples
215///
216/// ```
217/// use oem_cp::encode_string_lossy;
218/// use oem_cp::code_table::{ENCODING_TABLE_CP437, ENCODING_TABLE_CP737};
219/// assert_eq!(encode_string_lossy("π≈22/7", &ENCODING_TABLE_CP437), vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]);
220/// // Archimedes in Greek
221/// assert_eq!(encode_string_lossy("Αρχιμήδης", &ENCODING_TABLE_CP737), vec![0x80, 0xA8, 0xAE, 0xA0, 0xA3, 0xE3, 0x9B, 0x9E, 0xAA]);
222/// // Japanese characters are not defined in CP437 and replaced with `?` (0x3F)
223/// // "日本語ja_jp" => "???ja_jp"
224/// assert_eq!(encode_string_lossy("日本語ja_jp", &ENCODING_TABLE_CP437), vec![0x3F, 0x3F, 0x3F, 0x6A, 0x61, 0x5F, 0x6A, 0x70]);
225/// ```
226pub fn encode_string_lossy<'a, T: Into<Cow<'a, str>>>(
227    src: T,
228    encoding_table: &OEMCPHashMap<char, u8>,
229) -> Vec<u8> {
230    src.into()
231        .chars()
232        .map(|c| {
233            if (c as u32) < 128 {
234                c as u8
235            } else {
236                encoding_table.get(&c).copied().unwrap_or(b'?')
237            }
238        })
239        .collect()
240}
241
242#[cfg(test)]
243mod tests {
244    use super::*;
245    use crate::code_table::*;
246    use once_cell::sync::Lazy;
247
248    static CP437_VALID_PAIRS: Lazy<Vec<(&'static str, Vec<u8>)>> = Lazy::new(|| {
249        vec![
250            ("√α²±ß²", vec![0xFB, 0xE0, 0xFD, 0xF1, 0xE1, 0xFD]),
251            ("és", vec![0x82, 0x73]),
252            ("più", vec![0x70, 0x69, 0x97]),
253            ("½÷¼=2", vec![0xAB, 0xF6, 0xAC, 0x3D, 0x32]),
254        ]
255    });
256    static CP874_VALID_PAIRS: Lazy<Vec<(&'static str, Vec<u8>)>> = Lazy::new(|| {
257        vec![
258            // cspell: disable
259            (
260                "ราชอาณาจักรไท",
261                vec![
262                    0xC3, 0xD2, 0xAA, 0xCD, 0xD2, 0xB3, 0xD2, 0xA8, 0xD1, 0xA1, 0xC3, 0xE4, 0xB7,
263                ],
264            ),
265            (
266                "ต้มยำกุ้ง",
267                vec![0xB5, 0xE9, 0xC1, 0xC2, 0xD3, 0xA1, 0xD8, 0xE9, 0xA7],
268            ),
269            // cspell: enable
270        ]
271    });
272    static CP857_VALID_PAIRS: Lazy<Vec<(&'static str, Vec<u8>)>> = Lazy::new(|| {
273        vec![
274            // cspell: disable
275            ("½÷¼=2", vec![0xAB, 0xF6, 0xAC, 0x3D, 0x32]),
276            ("¼×3=¾", vec![0xAC, 0xE8, 0x33, 0x3D, 0xF3]),
277            ("İran", vec![0x98, 0x72, 0x61, 0x6E]),
278            ("ırmak", vec![0x8D, 0x72, 0x6D, 0x61, 0x6B]),
279            ("iş", vec![0x69, 0x9F]),
280            // cspell: enable
281        ]
282    });
283    /// OEM SBCSs used in some languages (locales)
284    static WINDOWS_USED_CODEPAGES: Lazy<Vec<u16>> = Lazy::new(|| {
285        vec![
286            437, // 720, // TODO: implement for locales using Arabic alphabets
287            737, 775, 850, 852, 855, 857, 862, 866, 874,
288        ]
289    });
290    #[allow(clippy::type_complexity)]
291    static WINDOWS_CONVERSION_VALID_TESTCASES: Lazy<Vec<(u16, Vec<(u8, char)>)>> =
292        Lazy::new(|| {
293            vec![
294                (437, vec![(0x82, 'é'), (0x9D, '¥'), (0xFB, '√')]),
295                (850, vec![(0xD0, 'ð'), (0xF3, '¾'), (0x9E, '×')]),
296                (874, vec![(0x80, '€'), (0xDF, '฿'), (0xA1, 'ก')]),
297            ]
298        });
299    #[test]
300    fn cp437_encoding_test() {
301        for (utf8_ref, cp437_ref) in &*CP437_VALID_PAIRS {
302            assert_eq!(
303                &encode_string_lossy(*utf8_ref, &ENCODING_TABLE_CP437),
304                cp437_ref
305            );
306            assert_eq!(
307                &(encode_string_checked(*utf8_ref, &ENCODING_TABLE_CP437).unwrap()),
308                cp437_ref
309            );
310        }
311    }
312    #[test]
313    fn cp437_decoding_test() {
314        for (utf8_ref, cp437_ref) in &*CP437_VALID_PAIRS {
315            assert_eq!(
316                &decode_string_complete_table(cp437_ref, &DECODING_TABLE_CP437),
317                *utf8_ref
318            );
319        }
320    }
321    #[test]
322    fn cp874_encoding_test() {
323        for (utf8_ref, cp874_ref) in &*CP874_VALID_PAIRS {
324            assert_eq!(
325                &encode_string_lossy(*utf8_ref, &ENCODING_TABLE_CP874),
326                cp874_ref
327            );
328            assert_eq!(
329                &(encode_string_checked(*utf8_ref, &ENCODING_TABLE_CP874).unwrap()),
330                cp874_ref
331            );
332        }
333    }
334    #[test]
335    fn cp874_decoding_test() {
336        for (utf8_ref, cp874_ref) in &*CP874_VALID_PAIRS {
337            assert_eq!(
338                &decode_string_incomplete_table_lossy(cp874_ref, &DECODING_TABLE_CP874),
339                *utf8_ref
340            );
341            assert_eq!(
342                &*(decode_string_incomplete_table_checked(cp874_ref, &DECODING_TABLE_CP874)
343                    .unwrap_or_else(|| panic!(
344                        "{cp874_ref:?} (intended for {utf8_ref:?}) is not a valid cp874 bytes."
345                    ))),
346                *utf8_ref
347            );
348        }
349    }
350    #[test]
351    fn cp857_encoding_test() {
352        for (utf8_ref, cp857_ref) in &*CP857_VALID_PAIRS {
353            assert_eq!(
354                &encode_string_lossy(*utf8_ref, &ENCODING_TABLE_CP857),
355                cp857_ref
356            );
357            assert_eq!(
358                &(encode_string_checked(*utf8_ref, &ENCODING_TABLE_CP857).unwrap()),
359                cp857_ref
360            );
361        }
362    }
363    #[test]
364    fn cp857_decoding_test() {
365        for (utf8_ref, cp857_ref) in &*CP857_VALID_PAIRS {
366            assert_eq!(
367                &decode_string_incomplete_table_lossy(cp857_ref, &DECODING_TABLE_CP857),
368                *utf8_ref
369            );
370            assert_eq!(
371                &*(decode_string_incomplete_table_checked(cp857_ref, &DECODING_TABLE_CP857)
372                    .unwrap_or_else(|| panic!(
373                        "{cp857_ref:?} (intended for {utf8_ref:?}) is not a valid cp857 bytes."
374                    ))),
375                *utf8_ref
376            );
377        }
378    }
379
380    #[test]
381    fn windows_codepages_coverage_test() {
382        for cp in &*WINDOWS_USED_CODEPAGES {
383            assert!(
384                ENCODING_TABLE_CP_MAP.get(cp).is_some(),
385                "Encoding table for cp{cp} is not defined",
386            );
387            assert!(
388                DECODING_TABLE_CP_MAP.get(cp).is_some(),
389                "Decoding table for cp{cp} is not defined",
390            );
391        }
392    }
393
394    /// Convert codepoint to Unicode via WindowsAPI
395    ///
396    /// # Arguments
397    ///
398    /// * `byte` - code point to convert to Unicode
399    /// * `codepage` - code page
400    #[cfg(windows)]
401    fn windows_to_unicode_char(byte: u8, codepage: u16) -> Option<char> {
402        let input_buf = [byte];
403        let mut win_decode_buf: Vec<u16>;
404        unsafe {
405            use std::ptr::null_mut;
406            use winapi::shared::winerror::ERROR_NO_UNICODE_TRANSLATION;
407            use winapi::um::errhandlingapi::GetLastError;
408            use winapi::um::stringapiset::MultiByteToWideChar;
409            use winapi::um::winnls::MB_ERR_INVALID_CHARS;
410            let win_decode_len = MultiByteToWideChar(
411                codepage as u32,
412                MB_ERR_INVALID_CHARS,
413                input_buf.as_ptr() as *const i8,
414                1,
415                null_mut(),
416                0,
417            );
418            if win_decode_len <= 0 {
419                if GetLastError() == ERROR_NO_UNICODE_TRANSLATION {
420                    return None;
421                }
422                panic!("MultiByteToWideChar (size checking) for 0x{byte:X} failed in cp{codepage}");
423            }
424            win_decode_buf = vec![0; win_decode_len as usize];
425            let win_decode_status = MultiByteToWideChar(
426                codepage as u32,
427                MB_ERR_INVALID_CHARS,
428                input_buf.as_ptr() as *const i8,
429                1,
430                win_decode_buf.as_mut_ptr(),
431                win_decode_len,
432            );
433            assert_eq!(
434                win_decode_status, win_decode_len,
435                "MultiByteToWideChar (writing) failed for 0x{byte:X} in cp{codepage} (size checking returned {win_decode_len} / writing returned {win_decode_status})"
436            );
437        }
438        let string_buf = String::from_utf16(&win_decode_buf).unwrap();
439        if string_buf.chars().count() != 1 {
440            return None;
441        }
442        return Some(string_buf.chars().next().unwrap());
443    }
444
445    #[cfg(windows)]
446    #[test]
447    fn windows_to_unicode_char_test() {
448        static WINDOWS_CONVERSION_INVALID_TESTCASES: Lazy<Vec<(u16, Vec<u8>)>> = Lazy::new(|| {
449            vec![
450                (857, vec![0xE7, 0xF2]),
451                (874, vec![0xDB, 0xDC, 0xDD, 0xDE, 0xFC, 0xFD, 0xFE, 0xFF]),
452            ]
453        });
454        use itertools::join;
455        for (codepage, testcases) in &*WINDOWS_CONVERSION_VALID_TESTCASES {
456            let result = testcases
457                .iter()
458                .map(|(source, _)| windows_to_unicode_char(*source, *codepage))
459                .collect::<Vec<Option<char>>>();
460            assert!(
461                testcases
462                    .iter()
463                    .zip(result.iter())
464                    .all(|((_, target), converted)| converted
465                        .map(|c| c == *target)
466                        .unwrap_or(false)),
467                "failed in cp{}:\n{}",
468                codepage,
469                join(
470                    testcases
471                        .iter()
472                        .zip(result.iter())
473                        .filter(|((_, target), converted)| converted
474                            .map(|c| c != *target)
475                            .unwrap_or(true))
476                        .map(|((from, target), converted)| format!(
477                            "0x{from:X} => {target:?} (target) / {converted:?} (Windows)"
478                        )),
479                    ", "
480                )
481            );
482        }
483        for (codepage, testcases) in &*WINDOWS_CONVERSION_INVALID_TESTCASES {
484            let result = testcases
485                .iter()
486                .map(|source| windows_to_unicode_char(*source, *codepage))
487                .collect::<Vec<Option<char>>>();
488            assert!(
489                result.iter().all(|r| r.is_none()),
490                "Some codepoints in cp{} weren't None: {}",
491                codepage,
492                join(
493                    testcases
494                        .iter()
495                        .zip(result.iter())
496                        .filter(|(_, r)| r.is_some())
497                        .map(|(t, r)| format!("0x{:X} => {:?}", t, r.unwrap())),
498                    ", "
499                )
500            );
501        }
502    }
503
504    #[cfg(windows)]
505    #[test]
506    fn compare_to_winapi_decoding_test() {
507        let windows_testing_codepages: Vec<(u16, Option<Vec<std::ops::Range<u8>>>)> = vec![
508            (437, None),
509            // (720, None),
510            (737, None),
511            (775, None),
512            (850, None),
513            (852, None),
514            (855, None),
515            (857, None),
516            (862, None),
517            (866, None),
518            // CP437 is broken in Windows (0x81-0x84,0x86-0x90,0x98-A0 are mapped to U+XX as are, but they must be undefined)
519            (874, None),
520        ];
521        use std::borrow::Cow;
522        let default_range = Cow::from(vec![(128..255).collect::<Vec<u8>>()]);
523        use itertools::join;
524        for (codepage, testing_ranges) in &*windows_testing_codepages {
525            let testing_ranges = testing_ranges
526                .as_ref()
527                .map(|v| {
528                    Cow::from(
529                        v.iter()
530                            .map(|r| r.clone().collect::<Vec<u8>>())
531                            .collect::<Vec<Vec<u8>>>(),
532                    )
533                })
534                .unwrap_or(default_range.clone());
535            for testing in testing_ranges.as_ref() {
536                let msg = format!("Decoding table for cp{codepage} is not defined");
537                let library_result = DECODING_TABLE_CP_MAP
538                    .get(codepage)
539                    .expect(&msg)
540                    .decode_string_lossy(testing);
541                let windows_result = testing
542                    .iter()
543                    .map(|codepoint| {
544                        windows_to_unicode_char(*codepoint, *codepage)
545                            .and_then(|ch| {
546                                if 0xE000 <= ch as u32 && ch as u32 <= 0xF8FF {
547                                    None
548                                } else {
549                                    Some(ch)
550                                }
551                            })
552                            .unwrap_or('\u{FFFD}')
553                    })
554                    .collect::<String>();
555                assert_eq!(
556                    library_result,
557                    windows_result,
558                    "Different in cp{}:\n {}",
559                    codepage,
560                    join(
561                        testing
562                            .iter()
563                            .zip(library_result.chars().zip(windows_result.chars()))
564                            .filter(|(_, (l, w))| l != w)
565                            .map(|(from, (lib, win))| format!(
566                                "0x{from:X} => {lib:?} (library) != {win:?} (Windows)"
567                            )),
568                        ", "
569                    )
570                );
571            }
572        }
573    }
574}