import re
from html.parser import HTMLParser
import requests
from sys import stdout
import json
from collections import deque
from datetime import datetime, timezone
class FetchingDecompositionHTMLParser(HTMLParser):
    """Collect composed/decomposed code point pairs from an HTML table.

    Only text found inside a ``<p>`` element that is itself inside a ``<td>``
    cell is inspected. A cell holding exactly one ``0x...`` code point is
    remembered as the pending composed character; a later cell holding two or
    more code points is recorded as the decomposition of that pending
    character.

    Because a single-code-point cell always replaces the pending character,
    decompositions that consist of one code point are never recorded.

    Attributes:
        encoding_dic: maps a composed character to its decomposed string.
        decoding_dic: trie keyed by the decomposed characters. Every node has
            the shape ``{"current": composed_char_or_None, "next": {...}}``.
        char_to_be_composed: the pending composed character, or ``""`` when
            none is pending.
    """

    def __init__(self):
        super().__init__()
        self.in_td = False
        self.in_p = False
        self.encoding_dic = {}
        self.decoding_dic = {}
        # Matches text that starts with one or more space-separated code points.
        self.overall_regex = re.compile(r"0x([0-9A-F]+)(?: 0x([0-9A-F]+))*")
        # Matches a single ``0x``-prefixed, upper-case hexadecimal code point.
        self.one_regex = re.compile(r"0x([0-9A-F]+)")
        self.char_to_be_composed = ""

    def handle_starttag(self, tag, attrs):
        """Track entry into ``<td>`` cells and into ``<p>`` elements within them."""
        name = tag.lower()
        if name == "td":
            self.in_td = True
        if name == "p" and self.in_td:
            self.in_p = True

    def handle_endtag(self, tag):
        """Track exit from ``<td>`` and ``<p>`` elements."""
        name = tag.lower()
        if name == "td" and self.in_td:
            self.in_td = False
        if name == "p" and self.in_p:
            self.in_p = False

    def handle_data(self, data):
        """Interpret the text of a table cell as a list of code points.

        Text outside ``<td><p>`` or not starting with a ``0x...`` code point
        is ignored.
        """
        if not (self.in_p and self.in_td):
            return

        # Pretty-printed markup may surround the value with spaces or
        # newlines; without stripping, such cells would be skipped silently.
        text = data.strip()
        if self.overall_regex.match(text) is None:
            return

        codepoints = [
            chr(int(m[1], 16))
            for m in map(self.one_regex.match, text.split())
            if m is not None
        ]
        if not codepoints:
            return

        if len(codepoints) == 1:
            # A lone code point is the composed character for the next cell.
            self.char_to_be_composed = codepoints[0]
            return

        composed = self.char_to_be_composed
        # The pending character is consumed whether or not it is usable.
        self.char_to_be_composed = ""
        if not composed:
            # A decomposition without a preceding composed character would
            # otherwise be stored under the empty string, which the code
            # generators cannot pass to ord().
            return

        self.encoding_dic[composed] = "".join(codepoints)

        # Walk (and extend) the trie along the decomposed sequence, then mark
        # the final node with the composed character.
        node = self.decoding_dic.setdefault(
            codepoints[0], {"current": None, "next": {}}
        )
        for component in codepoints[1:]:
            node = node["next"].setdefault(
                component, {"current": None, "next": {}}
            )
        node["current"] = composed
def print_pre(f=stdout):
    """Write the preamble of the generated Rust module to *f*.

    The preamble consists of the module doc comment (including the UTC time
    of generation, to second precision), the ``use`` declarations and the
    opening line of the ``lazy_static!`` block.
    """
    generated_at = datetime.now(timezone.utc).isoformat(timespec="seconds")
    preamble_lines = (
        "//! Definition of Unicode decomposition dictionaries",
        "//!",
        "//! Generated based on https://developer.apple.com/library/archive/technotes/tn/tn1150table.html",
        f"//! at {generated_at}",
        "use super::reverse_tree::ReverseTreeNode;",
        "use hashbrown::HashMap;",
        "use lazy_static::lazy_static;",
        "lazy_static! {",
    )
    print("\n".join(preamble_lines), file=f)
def print_post(f=stdout):
    """Write the closing brace that ends the generated ``lazy_static!`` block."""
    closing_brace = "}"
    print(closing_brace, file=f)
def print_encoding_dic(obj,f=stdout):
    """Write the Rust definition of ``MAP_TO_HFS`` to *f*.

    *obj* maps each composed character to the string of its decomposed
    components. One ``map.insert`` line is emitted per entry, in the
    iteration order of *obj*, with every character written as a Rust
    unicode escape of at least four upper-case hex digits.
    """

    def rust_escape(ch):
        # Rust unicode escape for a single character.
        return f"\\u{{{ord(ch):04X}}}"

    print("""\
/// map from composed character (normal) to decomposed components (HFS+)
///
/// # Examples
///
/// ```ignore
/// assert_eq!((*MAP_TO_HFS).get(&'\\u{00E9}').unwrap(), "e\\u{0301}");
/// ```
pub static ref MAP_TO_HFS: HashMap<char, &'static str> = {
let mut map = HashMap::new();""",file=f)
    for composed, components in obj.items():
        key_literal = rust_escape(composed)
        value_literal = "".join(map(rust_escape, components))
        print(f" map.insert('{key_literal}', \"{value_literal}\");", file=f)
    for trailer_line in (" return map;", " };"):
        print(trailer_line, file=f)
def _print_de_(dic, var_name, f=stdout):
    """Recursively write Rust statements that build one level of the trie.

    *dic* maps a character to a node with a ``"next"`` sub-dictionary and an
    optional ``"current"`` composed character. A ``HashMap`` named *var_name*
    is declared, and every node of *dic* is inserted into it. A node with
    children first has its child map emitted (depth first) under a variable
    name derived from *var_name* and the character's hex code.
    """
    print(f" let mut {var_name} = HashMap::new();", file=f)
    for key_char, node in dic.items():
        hex_lower = format(ord(key_char), "04x")
        key_literal = f"'\\u{{{hex_lower.upper()}}}'"

        composed = node.get("current")
        if composed:
            current_expr = f"Some('\\u{{{ord(composed):04X}}}')"
        else:
            current_expr = "None"

        children = node["next"]
        if not children:
            next_expr = "None"
        else:
            # Children of the root get a fresh "u<hex>" name; deeper levels
            # append their hex code to the parent's name.
            if var_name == "root":
                child_var = f"u{hex_lower}"
            else:
                child_var = f"{var_name}_{hex_lower}"
            _print_de_(children, child_var, f)
            next_expr = f"Some(Box::new({child_var}))"

        print(f" {var_name}.insert({key_literal}, ReverseTreeNode::new({current_expr}, {next_expr}));", file=f)
def print_decoding_dic(dic,f=stdout):
    """Write the Rust definition of ``MAP_TO_NORMAL`` to *f*.

    The doc comment and the opening of the static are written first, then the
    statements that build the trie *dic* under the variable name ``root``,
    and finally the lines that return ``root`` and close the initializer.
    """
    doc_and_opening = """\
/// Dictionary (map) from decomposed components to sub dictionaries and composed characters
///
/// # Examples
///
/// ```ignore
/// assert_eq!((*MAP_TO_NORMAL).get(&'e').unwrap().next.unwrap().get(&'\\u{0301}').unwrap().current.unwrap(), '\\u{00E9}');
/// ```
pub static ref MAP_TO_NORMAL: HashMap<char, ReverseTreeNode> = {"""
    print(doc_and_opening, file=f)
    _print_de_(dic, "root", f)
    for trailer_line in (" return root;", " };"):
        print(trailer_line, file=f)
if __name__ == "__main__":
    # Regenerate src/code_table.rs from the decomposition table published in
    # Apple's Technical Note TN1150.
    parser = FetchingDecompositionHTMLParser()
    # Without a timeout requests may wait indefinitely on a stalled server.
    req = requests.get(
        "https://developer.apple.com/library/archive/technotes/tn/tn1150table.html",
        timeout=30,
    )
    # Fail on 4xx/5xx instead of parsing an error page into empty tables.
    req.raise_for_status()
    parser.feed(req.text)
    # Flush any text the parser is still buffering.
    parser.close()
    if not parser.encoding_dic:
        # Nothing recognisable was parsed (e.g. the page layout changed);
        # keep the existing generated file rather than overwrite it.
        raise SystemExit(
            "no decomposition entries found; src/code_table.rs left untouched"
        )
    with open("src/code_table.rs", "w", encoding="utf-8") as f:
        print_pre(f)
        print_encoding_dic(parser.encoding_dic, f)
        print_decoding_dic(parser.decoding_dic, f)
        print_post(f)