oem_cp 0.2.0

Rust library that handles OEM code pages (e.g. CP{437,737,850}) for single-byte character sets
Documentation
from typing import Optional, List, Dict
from io import TextIOBase
import re
import requests
from sys import stdout, argv
from pathlib import Path
import json


def convert_to_reverse_map(table: List[Optional[int]]) -> Dict[int, int]:
    """Invert a decoding table into a Unicode-to-code-page lookup.

    Args:
        table: Sequence whose index is the code-page value and whose entries
            are Unicode code points, or ``None`` where nothing is mapped.

    Returns:
        A dict keyed by Unicode code point whose value is the index at which
        that code point occurs in ``table``.  ``None`` entries are skipped.
        When a code point occurs more than once, the last index wins.
    """
    return {
        unicode_cp: table_index
        for table_index, unicode_cp in enumerate(table)
        if unicode_cp is not None
    }


def print_header(timestamp: str, f: TextIOBase = stdout):
    """Write the preamble of the generated Rust file to ``f``.

    The preamble consists of two ``//!`` doc-comment lines (the second one
    embedding ``timestamp``) followed by the ``use`` declarations.

    Args:
        timestamp: Text inserted verbatim after ``Generated at``.
        f: Destination text stream; defaults to standard output.
    """
    header_lines = (
        "//! Code table",
        f"//! Generated at {timestamp}",
        "use super::code_table_type::TableType;",
        "use ahash::AHashMap;",
        "use lazy_static::lazy_static;",
        "use TableType::*;",
    )
    print("\n".join(header_lines), file=f)


def print_codepage_table(
    codepage: int, table: List[Optional[int]], f: TextIOBase = stdout
):
    """Write the Rust decoding-table static for one code page to ``f``.

    Only the entries from index 128 onwards are emitted.  If ``table``
    contains ``None`` anywhere, the static is declared as
    ``[Option<char>; 128]`` with one entry per line; otherwise it is declared
    as ``[char; 128]`` with eight entries per line.

    Args:
        codepage: Code page number, used in the doc comment and static name.
        table: Unicode code points indexed by code-page value; ``None`` marks
            an undefined mapping.
        f: Destination text stream; defaults to standard output.
    """
    upper_half = table[128:]

    print(f"/// Decoding table (CP{codepage} to Unicode)", file=f)
    print(f"pub static DECODING_TABLE_CP{codepage}", end="", file=f)

    if None not in table:
        # Fully defined table: plain chars, wrapped after every eighth entry.
        print(": [char; 128] = [", end="", file=f)
        for position, codepoint in enumerate(upper_half):
            separator = " " if position % 8 else "\n    "
            print(separator, end="", file=f)
            print(f"'\\u{{{codepoint:04X}}}',", end="", file=f)
        print("\n];", file=f)
        return

    # At least one undefined mapping: wrap every entry in Option.
    print(": [Option<char>; 128] = [", file=f)
    for codepoint in upper_half:
        if codepoint is None:
            entry = "    None,"
        else:
            entry = f"    Some('\\u{{{codepoint:04X}}}'),"
        print(entry, file=f)
    print("];", file=f)


def print_reverse_map(reverse_map: Dict[int, Dict[int, int]], f: TextIOBase = stdout):
    """Write one ``lazy_static!`` block holding every encoding table to ``f``.

    For each code page an ``ENCODING_TABLE_CP<n>`` static of type
    ``AHashMap<char, u8>`` is emitted, with one ``m.insert`` line per mapping.
    Mappings whose key equals its value and is below 128 are left out.

    Args:
        reverse_map: Code page number -> (Unicode code point -> code-page
            value).
        f: Destination text stream; defaults to standard output.
    """
    print("lazy_static! {", file=f)
    for codepage, encoding in reverse_map.items():
        print(f"    /// Encoding table (Unicode to CP{codepage})", file=f)
        print(
            f"    pub static ref ENCODING_TABLE_CP{codepage}: AHashMap<char, u8> = {{",
            file=f,
        )
        print("        let mut m = AHashMap::new();", file=f)
        for unicode, dest in encoding.items():
            # Emit everything except identity mappings below 128.
            if unicode != dest or unicode >= 128:
                print(
                    f"        m.insert('\\u{{{unicode:04X}}}', 0x{dest:02X});",
                    file=f,
                )
        print("        return m;", file=f)
        print("    };", file=f)
    print("}", file=f)


def print_codepage_table_map(
    table_map: Dict[int, List[Optional[int]]], f: TextIOBase = stdout
):
    """Write the ``DECODING_TABLE_CP_MAP`` ``lazy_static!`` block to ``f``.

    The block starts with a fixed Rust doc comment, then registers each code
    page with one ``map.insert`` line.  A table containing ``None`` is wrapped
    in ``Incomplete``; any other table is wrapped in ``Complete``.

    Args:
        table_map: Code page number -> decoding table (``None`` marks an
            undefined mapping).
        f: Destination text stream; defaults to standard output.
    """
    # Fixed opening of the block; this text is emitted verbatim.
    print(
        """\
lazy_static! {
    /// map from codepage to decoding table
    ///
    /// `.get` returns `code_table_type::{Complete,Incomplete}`.
    ///
    /// * `Complete`: the decoding table doesn't have undefined mapping.
    /// * `Incomplete`:  it have some undefined mapping.
    ///
    /// This enumerate provides methods `decode_string_lossy` and `decode_string_checked`.
    /// The following examples show the use of them.  `if let Some(decoder) = *snip* decoder.decode_string_*snip*` is convenient for practical use.
    ///
    /// # Examples
    ///
    /// ```
    /// use oem_cp::code_table::{DECODING_TABLE_CP_MAP, DECODING_TABLE_CP437};
    /// use oem_cp::code_table_type::TableType::*;
    /// assert_eq!((*DECODING_TABLE_CP_MAP).get(&437).unwrap().decode_string_lossy(vec![0x31, 0xF6, 0xAB, 0x3D, 0x32]), "1÷½=2".to_string());
    /// if let Some(cp874_table) = (*DECODING_TABLE_CP_MAP).get(&874) {
    ///     // means shrimp in Thai (U+E49 => 0xE9)
    ///     assert_eq!(cp874_table.decode_string_checked(vec![0xA1, 0xD8, 0xE9, 0xA7]), Some("กุ้ง".to_string()));
    ///     // undefined mapping 0xDB for CP874 Windows dialect (strict mode with MB_ERR_INVALID_CHARS)
    ///     assert_eq!(cp874_table.decode_string_checked(vec![0xDB]), None);
    /// } else {
    ///     panic!("CP874 must be defined in DECODING_TABLE_CP_MAP");
    /// }
    /// ```
    pub static ref DECODING_TABLE_CP_MAP: AHashMap<u16, TableType> = {
        let mut map = AHashMap::new();""",
        file=f,
    )

    for codepage, table in table_map.items():
        variant = "Incomplete" if None in table else "Complete"
        print(
            f"        map.insert({codepage}, {variant}(&DECODING_TABLE_CP{codepage}));",
            file=f,
        )

    # Close the initializer closure, the static and the macro invocation.
    for closing_line in ("        return map;", "    };", "}"):
        print(closing_line, file=f)


def print_codepage_reverse_map_table(
    reverse_map: Dict[int, Dict[int, int]], f: TextIOBase = stdout
):
    """Write the ``ENCODING_TABLE_CP_MAP`` ``lazy_static!`` block to ``f``.

    The block starts with a fixed Rust doc comment, then registers each code
    page with one ``m.insert`` line that references ``ENCODING_TABLE_CP<n>``.
    Only the keys of ``reverse_map`` are used; the per-code-page mappings are
    not read here.

    Args:
        reverse_map: Code page number -> encoding mapping.
        f: Destination text stream; defaults to standard output.
    """
    # Fixed opening of the block; this text is emitted verbatim.
    print(
        """\
lazy_static! {
    /// map from codepage to encoding table
    ///
    /// # Examples
    ///
    /// ```
    /// use oem_cp::code_table::{ENCODING_TABLE_CP_MAP, ENCODING_TABLE_CP437};
    /// assert_eq!((*ENCODING_TABLE_CP_MAP).get(&437), Some(&&*ENCODING_TABLE_CP437));
    /// // CP932 (Shift-JIS; Japanese MBCS) is unsupported
    /// assert_eq!((*ENCODING_TABLE_CP_MAP).get(&932), None);
    ///
    /// use oem_cp::encode_string_checked;
    ///
    /// if let Some(cp437_table) = (*ENCODING_TABLE_CP_MAP).get(&437) {
    ///     assert_eq!(encode_string_checked("π≈22/7", cp437_table), Some(vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]));
    /// } else {
    ///     panic!("CP437 must be registerd in ENCODING_TABLE_CP_MAP");
    /// }
    /// ```
    pub static ref ENCODING_TABLE_CP_MAP: AHashMap<u16, &'static AHashMap<char, u8>> = {
        let mut m = AHashMap::new();""",
        file=f,
    )

    for codepage in reverse_map:
        print(f"        m.insert({codepage}, &*ENCODING_TABLE_CP{codepage});", file=f)

    # Close the initializer closure, the static and the macro invocation.
    for closing_line in ("        return m;", "    };", "}"):
        print(closing_line, file=f)


if __name__ == "__main__":
    # Script entry point: read the code tables JSON found next to this script
    # and regenerate src/code_table.rs (relative to the working directory).
    tables_json_path = Path(argv[0]).parent / "assets" / "code_tables.json"
    with tables_json_path.open(encoding="UTF-8", newline="\n") as f:
        raw_json = json.load(f)

    created_timestamp = raw_json["created"]

    # JSON object keys are strings; the generators expect integer code pages.
    table_map = {
        int(codepage_str): table for codepage_str, table in raw_json["tables"].items()
    }
    reverse_map = {
        codepage: convert_to_reverse_map(table)
        for codepage, table in table_map.items()
    }

    with open("src/code_table.rs", "w", encoding="utf-8", newline="\n") as f:
        print_header(created_timestamp, f)
        # Individual decoding tables come first, then the encoding tables,
        # then the two lookup maps that reference them by name.
        for codepage, table in table_map.items():
            print_codepage_table(codepage, table, f)
        print_reverse_map(reverse_map, f)
        print_codepage_table_map(table_map, f)
        print_codepage_reverse_map_table(reverse_map, f)