1use alloc::borrow::Cow;
2use alloc::string::String;
3use alloc::vec::Vec;
4use core::convert::Into;
5
6use super::code_table_type::TableType;
7use super::OEMCPHashMap;
8
9use TableType::*;
10
11impl TableType {
12 pub fn decode_string_checked<'a, T: Into<Cow<'a, [u8]>>>(&self, src: T) -> Option<String> {
34 match self {
35 Complete(table_ref) => Some(decode_string_complete_table(src, table_ref)),
36 Incomplete(table_ref) => decode_string_incomplete_table_checked(src, table_ref),
37 }
38 }
39 pub fn decode_string_lossy<'a, T: Into<Cow<'a, [u8]>>>(&self, src: T) -> String {
61 match self {
62 Complete(table_ref) => decode_string_complete_table(src, table_ref),
63 Incomplete(table_ref) => decode_string_incomplete_table_lossy(src, table_ref),
64 }
65 }
66}
67
/// Decodes single-byte code-page data using a table that defines every
/// code point in the upper half (0x80..=0xFF).
///
/// Bytes below 0x80 are passed through as the identical Unicode scalar
/// value; bytes from 0x80 upwards are looked up in `decoding_table` at
/// index `byte - 0x80`.
///
/// # Arguments
///
/// * `src` - the encoded bytes; anything convertible into `Cow<[u8]>`.
/// * `decoding_table` - the 128 characters assigned to bytes 0x80..=0xFF.
pub fn decode_string_complete_table<'a, T: Into<Cow<'a, [u8]>>>(
    src: T,
    decoding_table: &[char; 128],
) -> String {
    let bytes = src.into();
    let mut decoded = String::with_capacity(bytes.len());
    for &b in bytes.iter() {
        let ch = match b {
            0x00..=0x7F => char::from(b),
            // For b >= 0x80, `b - 0x80` selects the same slot as `b & 0x7F`.
            _ => decoding_table[usize::from(b - 0x80)],
        };
        decoded.push(ch);
    }
    decoded
}
98
/// Decodes single-byte code-page data using a table in which some upper-half
/// code points may be undefined, failing on the first undefined one.
///
/// Bytes below 0x80 map to the identical Unicode scalar value. Bytes from
/// 0x80 upwards are looked up in `decoding_table` at index `byte & 0x7F`;
/// if that entry is `None`, the whole call returns `None`.
///
/// # Arguments
///
/// * `src` - the encoded bytes; anything convertible into `Cow<[u8]>`.
/// * `decoding_table` - optional characters for bytes 0x80..=0xFF.
pub fn decode_string_incomplete_table_checked<'a, T: Into<Cow<'a, [u8]>>>(
    src: T,
    decoding_table: &[Option<char>; 128],
) -> Option<String> {
    src.into()
        .iter()
        .map(|&b| {
            if b.is_ascii() {
                Some(char::from(b))
            } else {
                decoding_table[usize::from(b & 0x7F)]
            }
        })
        // Collecting into `Option<String>` stops at the first `None`.
        .collect()
}
133
/// Decodes single-byte code-page data using a table in which some upper-half
/// code points may be undefined, substituting U+FFFD for undefined ones.
///
/// Bytes below 0x80 map to the identical Unicode scalar value. Bytes from
/// 0x80 upwards are looked up in `decoding_table` at index `byte - 0x80`.
///
/// # Arguments
///
/// * `src` - the encoded bytes; anything convertible into `Cow<[u8]>`.
/// * `decoding_table` - optional characters for bytes 0x80..=0xFF.
pub fn decode_string_incomplete_table_lossy<'a, T: Into<Cow<'a, [u8]>>>(
    src: T,
    decoding_table: &[Option<char>; 128],
) -> String {
    let bytes = src.into();
    let mut decoded = String::new();
    decoded.extend(bytes.iter().map(|&b| match b {
        0x00..=0x7F => char::from(b),
        _ => match decoding_table[usize::from(b - 0x80)] {
            Some(mapped) => mapped,
            // Undefined code point: emit the Unicode replacement character.
            None => '\u{FFFD}',
        },
    }));
    decoded
}
169
170pub fn encode_string_checked<'a, T: Into<Cow<'a, str>>>(
191 src: T,
192 encoding_table: &OEMCPHashMap<char, u8>,
193) -> Option<Vec<u8>> {
194 let mut ret = Vec::new();
195 for c in src.into().chars() {
196 ret.push(if (c as u32) < 128 {
197 c as u8
198 } else {
199 *encoding_table.get(&c)?
200 });
201 }
202 Some(ret)
203}
204
205pub fn encode_string_lossy<'a, T: Into<Cow<'a, str>>>(
227 src: T,
228 encoding_table: &OEMCPHashMap<char, u8>,
229) -> Vec<u8> {
230 src.into()
231 .chars()
232 .map(|c| {
233 if (c as u32) < 128 {
234 c as u8
235 } else {
236 encoding_table.get(&c).copied().unwrap_or(b'?')
237 }
238 })
239 .collect()
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use crate::code_table::*;
    use once_cell::sync::Lazy;

    /// Pairs of UTF-8 text and the CP437 bytes it must encode to / decode from.
    static CP437_VALID_PAIRS: Lazy<Vec<(&'static str, Vec<u8>)>> = Lazy::new(|| {
        vec![
            ("√α²±ß²", vec![0xFB, 0xE0, 0xFD, 0xF1, 0xE1, 0xFD]),
            ("és", vec![0x82, 0x73]),
            ("più", vec![0x70, 0x69, 0x97]),
            ("½÷¼=2", vec![0xAB, 0xF6, 0xAC, 0x3D, 0x32]),
        ]
    });

    /// Pairs of UTF-8 text and the CP874 bytes it must encode to / decode from.
    static CP874_VALID_PAIRS: Lazy<Vec<(&'static str, Vec<u8>)>> = Lazy::new(|| {
        vec![
            (
                "ราชอาณาจักรไท",
                vec![
                    0xC3, 0xD2, 0xAA, 0xCD, 0xD2, 0xB3, 0xD2, 0xA8, 0xD1, 0xA1, 0xC3, 0xE4, 0xB7,
                ],
            ),
            (
                "ต้มยำกุ้ง",
                vec![0xB5, 0xE9, 0xC1, 0xC2, 0xD3, 0xA1, 0xD8, 0xE9, 0xA7],
            ),
        ]
    });

    /// Pairs of UTF-8 text and the CP857 bytes it must encode to / decode from.
    static CP857_VALID_PAIRS: Lazy<Vec<(&'static str, Vec<u8>)>> = Lazy::new(|| {
        vec![
            ("½÷¼=2", vec![0xAB, 0xF6, 0xAC, 0x3D, 0x32]),
            ("¼×3=¾", vec![0xAC, 0xE8, 0x33, 0x3D, 0xF3]),
            ("İran", vec![0x98, 0x72, 0x61, 0x6E]),
            ("ırmak", vec![0x8D, 0x72, 0x6D, 0x61, 0x6B]),
            ("iş", vec![0x69, 0x9F]),
        ]
    });

    /// Code pages for which both an encoding and a decoding table must exist.
    static WINDOWS_USED_CODEPAGES: Lazy<Vec<u16>> = Lazy::new(|| {
        vec![
            437, 737, 775, 850, 852, 855, 857, 862, 866, 874,
        ]
    });

    /// Per code page, `(byte, expected char)` pairs that the Windows API is
    /// expected to decode successfully.
    ///
    /// Only the Windows-only test below reads this, so it is gated on
    /// `windows` to avoid an unused-static warning on other targets.
    // The nested tuple/Vec type is test data and not worth a dedicated alias.
    #[cfg(windows)]
    #[allow(clippy::type_complexity)]
    static WINDOWS_CONVERSION_VALID_TESTCASES: Lazy<Vec<(u16, Vec<(u8, char)>)>> =
        Lazy::new(|| {
            vec![
                (437, vec![(0x82, 'é'), (0x9D, '¥'), (0xFB, '√')]),
                (850, vec![(0xD0, 'ð'), (0xF3, '¾'), (0x9E, '×')]),
                (874, vec![(0x80, '€'), (0xDF, '฿'), (0xA1, 'ก')]),
            ]
        });

    /// Both encoders must turn each CP437 sample string into its reference bytes.
    #[test]
    fn cp437_encoding_test() {
        for (utf8_ref, cp437_ref) in &*CP437_VALID_PAIRS {
            assert_eq!(
                &encode_string_lossy(*utf8_ref, &ENCODING_TABLE_CP437),
                cp437_ref
            );
            assert_eq!(
                &(encode_string_checked(*utf8_ref, &ENCODING_TABLE_CP437).unwrap()),
                cp437_ref
            );
        }
    }

    /// The complete-table decoder must turn each CP437 sample back into its text.
    #[test]
    fn cp437_decoding_test() {
        for (utf8_ref, cp437_ref) in &*CP437_VALID_PAIRS {
            assert_eq!(
                &decode_string_complete_table(cp437_ref, &DECODING_TABLE_CP437),
                *utf8_ref
            );
        }
    }

    /// Both encoders must turn each CP874 sample string into its reference bytes.
    #[test]
    fn cp874_encoding_test() {
        for (utf8_ref, cp874_ref) in &*CP874_VALID_PAIRS {
            assert_eq!(
                &encode_string_lossy(*utf8_ref, &ENCODING_TABLE_CP874),
                cp874_ref
            );
            assert_eq!(
                &(encode_string_checked(*utf8_ref, &ENCODING_TABLE_CP874).unwrap()),
                cp874_ref
            );
        }
    }

    /// Both incomplete-table decoders must turn each CP874 sample back into its text.
    #[test]
    fn cp874_decoding_test() {
        for (utf8_ref, cp874_ref) in &*CP874_VALID_PAIRS {
            assert_eq!(
                &decode_string_incomplete_table_lossy(cp874_ref, &DECODING_TABLE_CP874),
                *utf8_ref
            );
            assert_eq!(
                &*(decode_string_incomplete_table_checked(cp874_ref, &DECODING_TABLE_CP874)
                    .unwrap_or_else(|| panic!(
                        "{cp874_ref:?} (intended for {utf8_ref:?}) is not a valid cp874 bytes."
                    ))),
                *utf8_ref
            );
        }
    }

    /// Both encoders must turn each CP857 sample string into its reference bytes.
    #[test]
    fn cp857_encoding_test() {
        for (utf8_ref, cp857_ref) in &*CP857_VALID_PAIRS {
            assert_eq!(
                &encode_string_lossy(*utf8_ref, &ENCODING_TABLE_CP857),
                cp857_ref
            );
            assert_eq!(
                &(encode_string_checked(*utf8_ref, &ENCODING_TABLE_CP857).unwrap()),
                cp857_ref
            );
        }
    }

    /// Both incomplete-table decoders must turn each CP857 sample back into its text.
    #[test]
    fn cp857_decoding_test() {
        for (utf8_ref, cp857_ref) in &*CP857_VALID_PAIRS {
            assert_eq!(
                &decode_string_incomplete_table_lossy(cp857_ref, &DECODING_TABLE_CP857),
                *utf8_ref
            );
            assert_eq!(
                &*(decode_string_incomplete_table_checked(cp857_ref, &DECODING_TABLE_CP857)
                    .unwrap_or_else(|| panic!(
                        "{cp857_ref:?} (intended for {utf8_ref:?}) is not a valid cp857 bytes."
                    ))),
                *utf8_ref
            );
        }
    }

    /// Every code page in `WINDOWS_USED_CODEPAGES` must be present in both
    /// the encoding and the decoding table maps.
    #[test]
    fn windows_codepages_coverage_test() {
        for cp in &*WINDOWS_USED_CODEPAGES {
            assert!(
                ENCODING_TABLE_CP_MAP.get(cp).is_some(),
                "Encoding table for cp{cp} is not defined",
            );
            assert!(
                DECODING_TABLE_CP_MAP.get(cp).is_some(),
                "Decoding table for cp{cp} is not defined",
            );
        }
    }

    /// Decodes a single byte in `codepage` with the Windows
    /// `MultiByteToWideChar` API.
    ///
    /// Returns `None` when Windows reports `ERROR_NO_UNICODE_TRANSLATION`, or
    /// when the decoded UTF-16 does not form exactly one `char`.
    ///
    /// # Panics
    ///
    /// Panics when the size query fails for any other reason, when the second
    /// call writes a different number of units than the size query reported,
    /// or when the produced UTF-16 is not valid.
    #[cfg(windows)]
    fn windows_to_unicode_char(byte: u8, codepage: u16) -> Option<char> {
        let input_buf = [byte];
        let mut win_decode_buf: Vec<u16>;
        // SAFETY: `input_buf` is a live 1-byte buffer and the input length
        // passed is 1. The first call passes a null output pointer with
        // length 0 (size query only). The second call passes a buffer of
        // exactly `win_decode_len` u16 units together with that same length.
        unsafe {
            use std::ptr::null_mut;
            use winapi::shared::winerror::ERROR_NO_UNICODE_TRANSLATION;
            use winapi::um::errhandlingapi::GetLastError;
            use winapi::um::stringapiset::MultiByteToWideChar;
            use winapi::um::winnls::MB_ERR_INVALID_CHARS;
            // First call: ask how many UTF-16 units the result needs.
            let win_decode_len = MultiByteToWideChar(
                codepage as u32,
                MB_ERR_INVALID_CHARS,
                input_buf.as_ptr() as *const i8,
                1,
                null_mut(),
                0,
            );
            if win_decode_len <= 0 {
                if GetLastError() == ERROR_NO_UNICODE_TRANSLATION {
                    return None;
                }
                panic!("MultiByteToWideChar (size checking) for 0x{byte:X} failed in cp{codepage}");
            }
            win_decode_buf = vec![0; win_decode_len as usize];
            // Second call: perform the actual conversion into the buffer.
            let win_decode_status = MultiByteToWideChar(
                codepage as u32,
                MB_ERR_INVALID_CHARS,
                input_buf.as_ptr() as *const i8,
                1,
                win_decode_buf.as_mut_ptr(),
                win_decode_len,
            );
            assert_eq!(
                win_decode_status, win_decode_len,
                "MultiByteToWideChar (writing) failed for 0x{byte:X} in cp{codepage} (size checking returned {win_decode_len} / writing returned {win_decode_status})"
            );
        }
        let string_buf = String::from_utf16(&win_decode_buf).unwrap();
        // Accept the result only if it consists of exactly one character.
        let mut chars = string_buf.chars();
        match (chars.next(), chars.next()) {
            (Some(ch), None) => Some(ch),
            _ => None,
        }
    }

    /// Sanity-checks `windows_to_unicode_char` itself: known-valid bytes must
    /// decode to the expected characters and known-invalid bytes to `None`.
    #[cfg(windows)]
    #[test]
    fn windows_to_unicode_char_test() {
        /// Per code page, bytes for which the helper is expected to return `None`.
        static WINDOWS_CONVERSION_INVALID_TESTCASES: Lazy<Vec<(u16, Vec<u8>)>> = Lazy::new(|| {
            vec![
                (857, vec![0xE7, 0xF2]),
                (874, vec![0xDB, 0xDC, 0xDD, 0xDE, 0xFC, 0xFD, 0xFE, 0xFF]),
            ]
        });
        use itertools::join;
        for (codepage, testcases) in &*WINDOWS_CONVERSION_VALID_TESTCASES {
            let result = testcases
                .iter()
                .map(|(source, _)| windows_to_unicode_char(*source, *codepage))
                .collect::<Vec<Option<char>>>();
            assert!(
                testcases
                    .iter()
                    .zip(result.iter())
                    .all(|((_, target), converted)| converted
                        .map(|c| c == *target)
                        .unwrap_or(false)),
                "failed in cp{}:\n{}",
                codepage,
                // List only the mismatching entries in the failure message.
                join(
                    testcases
                        .iter()
                        .zip(result.iter())
                        .filter(|((_, target), converted)| converted
                            .map(|c| c != *target)
                            .unwrap_or(true))
                        .map(|((from, target), converted)| format!(
                            "0x{from:X} => {target:?} (target) / {converted:?} (Windows)"
                        )),
                    ", "
                )
            );
        }
        for (codepage, testcases) in &*WINDOWS_CONVERSION_INVALID_TESTCASES {
            let result = testcases
                .iter()
                .map(|source| windows_to_unicode_char(*source, *codepage))
                .collect::<Vec<Option<char>>>();
            assert!(
                result.iter().all(|r| r.is_none()),
                "Some codepoints in cp{} weren't None: {}",
                codepage,
                join(
                    testcases
                        .iter()
                        .zip(result.iter())
                        .filter(|(_, r)| r.is_some())
                        .map(|(t, r)| format!("0x{:X} => {:?}", t, r.unwrap())),
                    ", "
                )
            );
        }
    }

    /// Compares the library's lossy decoding of the upper half of each code
    /// page against the Windows API, byte by byte.
    ///
    /// Bytes that Windows cannot decode, or that it maps into the private-use
    /// range U+E000..=U+F8FF, are treated as undefined and compared as U+FFFD.
    #[cfg(windows)]
    #[test]
    fn compare_to_winapi_decoding_test() {
        // `None` means "use the default range" for that code page.
        let windows_testing_codepages: Vec<(u16, Option<Vec<std::ops::Range<u8>>>)> = vec![
            (437, None),
            (737, None),
            (775, None),
            (850, None),
            (852, None),
            (855, None),
            (857, None),
            (862, None),
            (866, None),
            (874, None),
        ];
        use std::borrow::Cow;
        // Inclusive upper bound: a half-open `128..255` would leave byte 0xFF
        // untested.
        let default_range = Cow::from(vec![(128..=255).collect::<Vec<u8>>()]);
        use itertools::join;
        for (codepage, testing_ranges) in &*windows_testing_codepages {
            let testing_ranges = testing_ranges
                .as_ref()
                .map(|v| {
                    Cow::from(
                        v.iter()
                            .map(|r| r.clone().collect::<Vec<u8>>())
                            .collect::<Vec<Vec<u8>>>(),
                    )
                })
                // Clone the default only when no explicit ranges were given.
                .unwrap_or_else(|| default_range.clone());
            for testing in testing_ranges.as_ref() {
                let library_result = DECODING_TABLE_CP_MAP
                    .get(codepage)
                    // Build the panic message only on the failure path.
                    .unwrap_or_else(|| panic!("Decoding table for cp{codepage} is not defined"))
                    .decode_string_lossy(testing);
                let windows_result = testing
                    .iter()
                    .map(|codepoint| {
                        windows_to_unicode_char(*codepoint, *codepage)
                            .and_then(|ch| {
                                // Private-use results count as "undefined".
                                if 0xE000 <= ch as u32 && ch as u32 <= 0xF8FF {
                                    None
                                } else {
                                    Some(ch)
                                }
                            })
                            .unwrap_or('\u{FFFD}')
                    })
                    .collect::<String>();
                assert_eq!(
                    library_result,
                    windows_result,
                    "Different in cp{}:\n {}",
                    codepage,
                    // List only the bytes on which the two decoders disagree.
                    join(
                        testing
                            .iter()
                            .zip(library_result.chars().zip(windows_result.chars()))
                            .filter(|(_, (l, w))| l != w)
                            .map(|(from, (lib, win))| format!(
                                "0x{from:X} => {lib:?} (library) != {win:?} (Windows)"
                            )),
                        ", "
                    )
                );
            }
        }
    }
}