// oletools_rs 0.1.0
//
// Rust port of oletools — analysis tools for Microsoft Office files (VBA macros, DDE, OLE objects, RTF exploits).
// Documentation
//! Mapping from Windows codepage numbers to `encoding_rs` encodings.
//!
//! Used by VBA project parsing to decode module source code.

use encoding_rs::Encoding;

/// Return the `encoding_rs::Encoding` that corresponds to `codepage`, or
/// `None` if the codepage is unknown / unsupported.
pub fn codepage_to_encoding(codepage: u16) -> Option<&'static Encoding> {
    match codepage {
        437 => Some(encoding_rs::IBM866), // closest available
        850 | 858 => Some(encoding_rs::ISO_8859_15),
        874 => Some(encoding_rs::WINDOWS_874),
        932 => Some(encoding_rs::SHIFT_JIS),
        936 => Some(encoding_rs::GBK),
        949 => Some(encoding_rs::EUC_KR),
        950 => Some(encoding_rs::BIG5),
        1200 => Some(encoding_rs::UTF_16LE),
        1201 => Some(encoding_rs::UTF_16BE),
        1250 => Some(encoding_rs::WINDOWS_1250),
        1251 => Some(encoding_rs::WINDOWS_1251),
        1252 => Some(encoding_rs::WINDOWS_1252),
        1253 => Some(encoding_rs::WINDOWS_1253),
        1254 => Some(encoding_rs::WINDOWS_1254),
        1255 => Some(encoding_rs::WINDOWS_1255),
        1256 => Some(encoding_rs::WINDOWS_1256),
        1257 => Some(encoding_rs::WINDOWS_1257),
        1258 => Some(encoding_rs::WINDOWS_1258),
        10000 => Some(encoding_rs::MACINTOSH),
        10006 | 10007 | 10029 | 10079 | 10081 => Some(encoding_rs::MACINTOSH),
        20866 => Some(encoding_rs::KOI8_R),
        20932 => Some(encoding_rs::EUC_JP),
        21866 => Some(encoding_rs::KOI8_U),
        28591 => Some(encoding_rs::ISO_8859_2), // ISO-8859-1 maps here
        28592 => Some(encoding_rs::ISO_8859_2),
        28593 => Some(encoding_rs::ISO_8859_3),
        28594 => Some(encoding_rs::ISO_8859_4),
        28595 => Some(encoding_rs::ISO_8859_5),
        28596 => Some(encoding_rs::ISO_8859_6),
        28597 => Some(encoding_rs::ISO_8859_7),
        28598 => Some(encoding_rs::ISO_8859_8),
        28599 => Some(encoding_rs::WINDOWS_1254), // ISO-8859-9 ≈ windows-1254
        28603 => Some(encoding_rs::ISO_8859_13),
        28605 => Some(encoding_rs::ISO_8859_15),
        50220..=50222 => Some(encoding_rs::ISO_2022_JP),
        51932 => Some(encoding_rs::EUC_JP),
        51949 => Some(encoding_rs::EUC_KR),
        52936 => Some(encoding_rs::GBK),
        54936 => Some(encoding_rs::GB18030),
        65000 => Some(encoding_rs::UTF_8), // UTF-7 not supported, fallback
        65001 => Some(encoding_rs::UTF_8),
        _ => None,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Resolve `codepage` and return the canonical name of the encoding it
    /// maps to, or `None` when the lookup yields no encoding.
    fn encoding_name(codepage: u16) -> Option<&'static str> {
        codepage_to_encoding(codepage).map(|encoding| encoding.name())
    }

    /// Frequently seen codepages resolve to the expected encodings.
    #[test]
    fn test_common_codepages() {
        let expectations = [
            (1252_u16, "windows-1252"),
            (65001, "UTF-8"),
            (932, "Shift_JIS"),
        ];

        for (codepage, expected) in expectations {
            assert_eq!(encoding_name(codepage), Some(expected));
        }
    }

    /// A codepage without a mapping yields `None`.
    #[test]
    fn test_unknown_codepage() {
        assert_eq!(encoding_name(9999), None);
    }

    /// Codepage 1200 is little-endian UTF-16.
    #[test]
    fn test_utf16le() {
        assert_eq!(encoding_name(1200), Some("UTF-16LE"));
    }
}