import enum
import math
import os
import re
import sys
from collections import defaultdict
from itertools import batched
# Size of the Unicode codepoint space: valid codepoints are 0..=0x10FFFF.
NUM_CODEPOINTS = 0x110000
# Number of bits needed to index any codepoint (ceil(log2(0x10FFFF)) == 21).
MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
class OffsetType(enum.IntEnum):
    """The width (in bits) of each offset stored in a lookup-table entry."""

    # Four entries packed per byte.
    U2 = 2
    # Two entries packed per byte.
    U4 = 4
    # One entry per byte.
    U8 = 8
# Configuration of each lookup-table level as (low_bit, cap_bit, offset_type):
# a level is indexed using codepoint bits [low_bit, cap_bit) and stores offsets
# of the given width. Must stay in sync with the hardcoded `lookup_width`
# function emitted by `emit_module`.
TABLE_CFGS = [
    (13, MAX_CODEPOINT_BITS, OffsetType.U8),
    (6, 13, OffsetType.U8),
    (0, 6, OffsetType.U2),
]
# Name of the generated Rust module file.
MODULE_FILENAME = "tables.rs"
# Type aliases used in annotations below, for readability.
Codepoint = int
BitPos = int
def fetch_open(filename: str):
    """Opens `filename` (relative to the current directory) and returns its
    file object, downloading it from the Unicode Public data directory first
    if it does not already exist on disk.

    Exits the process with status 1 if the file cannot be opened.
    """
    basename = os.path.basename(filename)
    if not os.path.exists(basename):
        # BUG FIX: the URL previously ended in the literal text "(unknown)"
        # instead of interpolating `filename`, so the download always failed.
        # `curl -O` saves the file under its remote basename in the cwd.
        # NOTE(review): consider subprocess.run([...]) instead of os.system.
        os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
    try:
        return open(basename, encoding="utf-8")
    except OSError:
        sys.stderr.write(f"cannot load {basename}")
        sys.exit(1)
def load_unicode_version() -> "tuple[int, int, int]":
    """Parses the Unicode (major, minor, patch) version out of `ReadMe.txt`."""
    with fetch_open("ReadMe.txt") as readme:
        pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
        match = re.search(pattern, readme.read())
        return (int(match.group(1)), int(match.group(2)), int(match.group(3)))
class EffectiveWidth(enum.IntEnum):
    """The number of terminal columns a codepoint occupies, as packed into
    the final 2-bit table entries."""

    # Renders as zero columns (set from `load_zero_widths` in `main`).
    ZERO = 0
    # Renders as one column.
    NARROW = 1
    # Renders as two columns.
    WIDE = 2
    # One or two columns depending on context; stored as the sentinel value 3.
    AMBIGUOUS = 3
def load_east_asian_widths() -> "list[EffectiveWidth]":
    """Returns a list of effective widths, indexed by codepoint, derived from
    the UCD `EastAsianWidth.txt` file.

    `Neutral`, `Narrow`, and `Halfwidth` map to `EffectiveWidth.NARROW`;
    `Wide` and `Fullwidth` map to `EffectiveWidth.WIDE`;
    `Ambiguous` maps to `EffectiveWidth.AMBIGUOUS`.
    Codepoints not assigned in the file default to NARROW.
    """
    with fetch_open("EastAsianWidth.txt") as eaw:
        # Matches a width assignment for a single codepoint, e.g. "1F336;N  # ..."
        single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
        # Matches a width assignment for a codepoint range, e.g. "3001..3003;W  # ..."
        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)")
        # Maps the width category codes in the file onto condensed widths.
        width_codes = {
            **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},
            **{c: EffectiveWidth.WIDE for c in ["W", "F"]},
            "A": EffectiveWidth.AMBIGUOUS,
        }

        width_map = []
        current = 0
        for line in eaw.readlines():
            # BUG FIX: `raw_data = None` and the first `if` had been merged
            # into a single invalid statement; they are separate statements.
            raw_data = None  # (low, high, width-code)
            if match := single.match(line):
                raw_data = (match.group(1), match.group(1), match.group(2))
            elif match := multiple.match(line):
                raw_data = (match.group(1), match.group(2), match.group(3))
            else:
                continue
            low = int(raw_data[0], 16)
            high = int(raw_data[1], 16)
            width = width_codes[raw_data[2]]

            # The file lists ranges in ascending order; fill any gap with NARROW.
            assert current <= high
            while current <= high:
                width_map.append(EffectiveWidth.NARROW if current < low else width)
                current += 1

        # Pad out the rest of the codepoint space with the default width.
        while len(width_map) < NUM_CODEPOINTS:
            width_map.append(EffectiveWidth.NARROW)
        return width_map
def load_zero_widths() -> "list[bool]":
    """Returns a list `l` of length `NUM_CODEPOINTS` where `l[c]` is True if
    codepoint `c` should be rendered with zero width: the C0/C1 control codes
    and DEL, every `Default_Ignorable_Code_Point` or `Grapheme_Extend`
    codepoint, a handful of extra combining marks, and Hangul jamo vowels (V)
    and trailing consonants (T).
    """
    zw_map = [False] * NUM_CODEPOINTS

    # C0 control codes U+0000..U+001F.
    for c in range(0x00, 0x20):
        zw_map[c] = True
    # DEL (U+007F) and the C1 control codes U+0080..U+009F.
    for c in range(0x7F, 0xA0):
        zw_map[c] = True

    with fetch_open("DerivedCoreProperties.txt") as properties:
        # Single codepoint carrying either property.
        single = re.compile(
            r"^([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
        )
        # Codepoint range carrying either property.
        multiple = re.compile(
            r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
        )
        for line in properties.readlines():
            # BUG FIX: `raw_data = None` and the first `if` had been merged
            # into a single invalid statement; they are separate statements.
            raw_data = None  # (low, high)
            if match := single.match(line):
                raw_data = (match.group(1), match.group(1))
            elif match := multiple.match(line):
                raw_data = (match.group(1), match.group(2))
            else:
                continue
            low = int(raw_data[0], 16)
            high = int(raw_data[1], 16)
            for cp in range(low, high + 1):
                zw_map[cp] = True

    # NOTE(review): these extra marks are forced to zero width even though the
    # properties above don't cover them — presumably rendering-related special
    # cases; confirm against the upstream generator before changing.
    for c in [0x0CC0, 0x0CC7, 0x0CC8, 0x0CCA, 0x0CCB, 0x1B3B, 0x1B3D, 0x1B43]:
        zw_map[c] = True

    with fetch_open("HangulSyllableType.txt") as categories:
        # Jamo vowels (V) and trailing consonants (T) combine with the
        # preceding syllable, so they add no extra columns.
        single = re.compile(r"^([0-9A-F]+)\s+;\s+(V|T)\s+")
        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(V|T)\s+")
        for line in categories.readlines():
            # Same merged-statement fix as above.
            raw_data = None  # (low, high)
            if match := single.match(line):
                raw_data = (match.group(1), match.group(1))
            elif match := multiple.match(line):
                raw_data = (match.group(1), match.group(2))
            else:
                continue
            low = int(raw_data[0], 16)
            high = int(raw_data[1], 16)
            for cp in range(low, high + 1):
                zw_map[cp] = True

    # U+115F HANGUL CHOSEONG FILLER is explicitly not zero-width.
    zw_map[0x115F] = False
    return zw_map
class Bucket:
def __init__(self):
self.entry_set = set()
self.widths = []
def append(self, codepoint: Codepoint, width: EffectiveWidth):
self.entry_set.add((codepoint, width))
self.widths.append(width)
def try_extend(self, attempt: "Bucket") -> bool:
(less, more) = (self.widths, attempt.widths)
if len(self.widths) > len(attempt.widths):
(less, more) = (attempt.widths, self.widths)
if less != more[: len(less)]:
return False
self.entry_set |= attempt.entry_set
self.widths = more
return True
def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
result = list(self.entry_set)
result.sort()
return result
def width(self) -> "EffectiveWidth | None":
if len(self.widths) == 0:
return None
potential_width = self.widths[0]
for width in self.widths[1:]:
if potential_width != width:
return None
return potential_width
def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
    """Distributes `entries` into `2**(cap_bit - low_bit)` buckets, keyed by
    bits [low_bit, cap_bit) of each entry's codepoint."""
    num_bits = cap_bit - low_bit
    assert num_bits > 0
    mask = (1 << num_bits) - 1
    buckets = [Bucket() for _ in range(1 << num_bits)]
    for codepoint, width in entries:
        slot = (codepoint >> low_bit) & mask
        buckets[slot].append(codepoint, width)
    return buckets
class Table:
    """One level of the multi-level lookup table: a flat list of entries, each
    an index into a deduplicated list of buckets (or, for the last level, an
    encoded width after `indices_to_widths`)."""

    def __init__(
        self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType
    ):
        """Builds the table from `entry_groups` (iterables of
        (codepoint, width) pairs), bucketing on codepoint bits
        [low_bit, cap_bit) and storing offsets of `offset_type` bits."""
        self.low_bit = low_bit
        self.cap_bit = cap_bit
        self.offset_type = offset_type
        self.entries = []
        self.indexed = []

        buckets = []
        for group in entry_groups:
            buckets.extend(make_buckets(group, self.low_bit, self.cap_bit))

        # Deduplicate: reuse the first indexed bucket that can absorb this one.
        for bucket in buckets:
            for position, candidate in enumerate(self.indexed):
                if candidate.try_extend(bucket):
                    self.entries.append(position)
                    break
            else:
                self.entries.append(len(self.indexed))
                self.indexed.append(bucket)

        # Every stored index must fit in the configured offset width.
        for index in self.entries:
            assert index < (1 << int(self.offset_type))

    def indices_to_widths(self):
        """Replaces each stored index with the width of the bucket it points
        to; used for the final level, which encodes widths directly."""
        self.entries = [int(self.indexed[index].width()) for index in self.entries]
        del self.indexed

    def buckets(self):
        """Returns the deduplicated buckets this table's entries point into."""
        return self.indexed

    def to_bytes(self) -> "list[int]":
        """Packs the entries into a list of byte values, little-endian within
        each byte (first entry in the lowest bits)."""
        per_byte = 8 // int(self.offset_type)
        packed = []
        for start in range(0, len(self.entries), per_byte):
            value = 0
            for offset in range(per_byte):
                value |= self.entries[start + offset] << (offset * int(self.offset_type))
            packed.append(value)
        return packed
def make_tables(
    table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
) -> "list[Table]":
    """Builds one `Table` per configuration in `table_cfgs`, feeding each
    level's deduplicated bucket contents into the next level."""
    tables = []
    entry_groups = [entries]
    for low_bit, cap_bit, offset_type in table_cfgs:
        table = Table(entry_groups, low_bit, cap_bit, offset_type)
        # The next level partitions each of this level's buckets further.
        entry_groups = (bucket.entries() for bucket in table.buckets())
        tables.append(table)
    return tables
def load_variation_sequences() -> "list[int]":
    """Returns every codepoint that, followed by U+FE0F, forms an emoji-style
    variation sequence, per `emoji-variation-sequences.txt`."""
    with fetch_open("emoji/emoji-variation-sequences.txt") as sequences:
        sequence = re.compile(r"^([0-9A-F]+)\s+FE0F\s*;\s+emoji style")
        codepoints = []
        for line in sequences.readlines():
            match = sequence.match(line)
            if match is not None:
                codepoints.append(int(match.group(1), 16))
        return codepoints
def make_variation_sequence_table(
    seqs: "list[int]",
    width_map: "list[EffectiveWidth]",
) -> "tuple[list[int], list[list[int]]]":
    """Builds a two-level lookup table for emoji-presentation starters:
    returns `(indexes, leaves)` where `indexes[i]` is the value of a
    codepoint's upper bits (cp >> 10) whose leaf is `leaves[i]`, and each leaf
    is a 128-byte (1024-bit) bitmap over the codepoint's 10 low bits.

    NOTE: iteration order of `prefixes_dict` must stay stable between building
    `indexes` and `leaves` — both rely on dict insertion order.
    """
    # Group the low 10 bits of each sequence codepoint under its upper bits.
    prefixes_dict = defaultdict(set)
    for cp in seqs:
        prefixes_dict[cp >> 10].add(cp & 0x3FF)
    # Drop any group whose members are all WIDE already: the caller documents
    # that lookups may spuriously answer either way for always-wide chars,
    # so those groups need no leaf at all.
    for k in list(prefixes_dict.keys()):
        if all(
            map(
                lambda cp: width_map[(k << 10) | cp] == EffectiveWidth.WIDE,
                prefixes_dict[k],
            )
        ):
            del prefixes_dict[k]
    indexes = list(prefixes_dict.keys())
    # For the groups we kept, also set the bit for every WIDE codepoint sharing
    # the prefix — harmless (see note above) and improves leaf sharing.
    for cp, width in enumerate(width_map):
        if width == EffectiveWidth.WIDE and (cp >> 10) in indexes:
            prefixes_dict[cp >> 10].add(cp & 0x3FF)
    # Render each group as a 128-byte bitmap, bit index = cp & 0x3FF.
    leaves = []
    for cps in prefixes_dict.values():
        leaf = [0] * 128
        for cp in cps:
            idx_in_leaf, bit_shift = divmod(cp, 8)
            leaf[idx_in_leaf] |= 1 << bit_shift
        leaves.append(leaf)
    return (indexes, leaves)
def emit_module(
    out_name: str,
    unicode_version: "tuple[int, int, int]",
    tables: "list[Table]",
    variation_table: "tuple[list[int], list[list[int]]]",
):
    """Writes the generated Rust module to `out_name`, containing the Unicode
    version constant, the hardcoded lookup functions, and the autogenerated
    table data. If `TABLE_CFGS` changes, the emitted `lookup_width` body must
    be updated to match.
    """
    if os.path.exists(out_name):
        os.remove(out_name)
    with open(out_name, "w", newline="\n", encoding="utf-8") as module:
        # License header and do-not-edit notice.
        module.write(
            """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
"""
        )
        module.write(
            f"""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-width is based on.
pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version};
"""
        )
        # Hardcoded lookup function; must mirror TABLE_CFGS.
        module.write(
            """
pub mod charwidth {
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
/// consulting a multi-level lookup table.
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
///
/// # Maintenance
/// The tables themselves are autogenerated but this function is hardcoded. You should have
/// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
/// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
/// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
#[inline]
fn lookup_width(c: char, is_cjk: bool) -> usize {
let cp = c as usize;
let t1_offset = TABLES_0[cp >> 13 & 0xFF];
// Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
// so each sub-table is 128 bytes in size.
// (Sub-tables are selected using the computed offset from the previous table.)
let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
// Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
// This is accomplished by packing four stored entries into one byte.
// So each sub-table is 2**(6-2) == 16 bytes in size.
// Since this is the last table, each entry represents an encoded width.
let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
// Extract the packed width
let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
// A width of 3 signifies that the codepoint is ambiguous width.
if width == 3 {
if is_cjk {
2
} else {
1
}
} else {
width.into()
}
}
"""
        )
        variation_idx, variation_leaves = variation_table
        # Emoji presentation sequence lookup: match on the top bits, then
        # index a 1024-bit leaf bitmap with the low bits.
        module.write(
            """
/// Whether this character forms an [emoji presentation sequence]
/// (https://www.unicode.org/reports/tr51/#def_emoji_presentation_sequence)
/// when followed by `'\\u{FEOF}'`.
/// Emoji presentation sequences are considered to have width 2.
/// This may spuriously return `true` or `false` for characters that are always wide.
#[inline]
pub fn starts_emoji_presentation_seq(c: char) -> bool {
let cp: u32 = c.into();
// First level of lookup uses all but 10 LSB
let top_bits = cp >> 10;
let idx_of_leaf: usize = match top_bits {
"""
        )
        for i, msbs in enumerate(variation_idx):
            module.write(f" {msbs} => {i},\n")
        module.write(
            """ _ => return false,
};
// Extract the 3-9th (0-indexed) least significant bits of `cp`,
// and use them to index into `leaf_row`.
let idx_within_leaf = usize::try_from((cp >> 3) & 0x7F).unwrap();
let leaf_byte = EMOJI_PRESENTATION_LEAVES.0[idx_of_leaf][idx_within_leaf];
// Use the 3 LSB of `cp` to index into `leaf_byte`.
((leaf_byte >> (cp & 7)) & 1) == 1
}
"""
        )
        module.write(
            """
/// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
/// `None` if `c` is a control character other than `'\\x00'`.
/// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
/// they're treated as single width.
#[inline]
pub fn width(c: char, is_cjk: bool) -> Option<usize> {
if c < '\\u{7F}' {
if c >= '\\u{20}' {
// U+0020 to U+007F (exclusive) are single-width ASCII codepoints
Some(1)
} else if c == '\\0' {
// U+0000 *is* a control code, but it's special-cased
Some(0)
} else {
// U+0001 to U+0020 (exclusive) are control codes
None
}
} else if c >= '\\u{A0}' {
// No characters >= U+00A0 are control codes, so we can consult the lookup tables
Some(lookup_width(c, is_cjk))
} else {
// U+007F to U+00A0 (exclusive) are control codes
None
}
}
"""
        )
        # Emit each level's byte array.
        subtable_count = 1
        for i, table in enumerate(tables):
            new_subtable_count = len(table.buckets())
            if i == len(tables) - 1:
                # The last table stores widths directly instead of offsets.
                table.indices_to_widths()
            # BUG FIX: this assignment had been merged onto the line above
            # into a single invalid statement; it belongs at loop-body level
            # so it runs for every table, not only the last.
            byte_array = table.to_bytes()
            module.write(
                f"""
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
            )
            for j, byte in enumerate(byte_array):
                if j % 15 == 0:
                    module.write("\n ")
                module.write(f" 0x{byte:02X},")
            module.write("\n ];\n")
            subtable_count = new_subtable_count
        # Emit the emoji-presentation leaf bitmaps, cache-line aligned.
        module.write(
            f"""
#[repr(align(128))]
struct Align128<T>(T);
/// Array of 1024-bit bitmaps. Index into the correct (obtained from `EMOJI_PRESENTATION_INDEX`)
/// bitmap with the 10 LSB of your codepoint to get whether it can start an emoji presentation seq.
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(variation_leaves)}]> = Align128([
"""
        )
        for leaf in variation_leaves:
            module.write(" [\n")
            for row in batched(leaf, 14):
                module.write(" ")
                for entry in row:
                    module.write(f" 0x{entry:02X},")
                module.write("\n")
            module.write(" ],\n")
        module.write(" ]);\n")
        module.write("}\n")
def main(module_filename: str):
    """Drives the whole generation: loads the UCD data, builds the width and
    emoji-presentation tables, prints a size report, and emits the Rust
    module to `module_filename`."""
    version = load_unicode_version()
    print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

    eaw_map = load_east_asian_widths()
    zw_map = load_zero_widths()

    # A zero-width flag overrides the East Asian width classification.
    width_map = [
        EffectiveWidth.ZERO if is_zero else eaw
        for eaw, is_zero in zip(eaw_map, zw_map)
    ]
    # U+00AD SOFT HYPHEN is treated as narrow.
    width_map[0x00AD] = EffectiveWidth.NARROW

    tables = make_tables(TABLE_CFGS, enumerate(width_map))
    emoji_variations = load_variation_sequences()
    variation_table = make_variation_sequence_table(emoji_variations, width_map)

    # Size report.
    print("------------------------")
    total_size = 0
    for i, table in enumerate(tables):
        size_bytes = len(table.to_bytes())
        print(f"Table {i} size: {size_bytes} bytes")
        total_size += size_bytes
    emoji_index_size = len(variation_table[0]) * 4
    print(f"Emoji presentation index size: {emoji_index_size} bytes")
    total_size += emoji_index_size
    emoji_leaves_size = len(variation_table[1]) * len(variation_table[1][0])
    print(f"Emoji presentation leaves size: {emoji_leaves_size} bytes")
    total_size += emoji_leaves_size
    print("------------------------")
    print(f" Total size: {total_size} bytes")

    emit_module(module_filename, version, tables, variation_table)
    print(f'Wrote to "{module_filename}"')


if __name__ == "__main__":
    main(MODULE_FILENAME)