import youseedee
import os
import re
import argparse
import csv
# Codepoints that output_chars() adds to every generated file, whatever the
# input requested: U+0000 NULL, U+000D CARRIAGE RETURN, U+0020 SPACE and
# U+00A0 NO-BREAK SPACE.
EVERYONE_GETS_THESE = [0x00, 0x0D, 0x20, 0xA0]
# Directory (relative to the current working directory) that output_chars()
# writes into; the output file name is the basename of the input file.
OUTPUT_PATH = "Lib/gfsubsets/data/"
# Blocks.txt as parsed by youseedee's reader; block_to_chars() iterates it as
# (start, end, name) triples.
blocks = youseedee.database["Blocks.txt"]["reader"]("Blocks.txt")
# Map each Unicode script's long name to its short alias, taken from the
# "sc" (Script) rows of the UCD file PropertyValueAliases.txt. Rows look like
# "sc ; Latn ; Latin", i.e. category ; short name ; long name.
aliases = {}
with open(
    os.path.join(youseedee.ucd_dir(), "PropertyValueAliases.txt"), "r", newline=""
) as f:
    reader = csv.reader(f, delimiter=";", skipinitialspace=True)
    for row in reader:
        # Blank lines and "#" comment lines split into fewer than three
        # fields and carry no alias information.
        if len(row) < 3:
            continue
        # A few script rows list additional aliases after the long name,
        # e.g. "sc ; Copt ; Coptic ; Qaac" and "sc ; Zinh ; Inherited ; Qaai".
        # Requiring exactly three fields dropped those scripts entirely, so
        # only the first three fields are used and any extras are ignored.
        category, short, long_name = row[:3]
        if category.strip() != "sc":
            continue
        # skipinitialspace only removes leading blanks; strip the padding
        # that precedes each ";" as well.
        aliases[long_name.strip()] = short.strip()
def block_to_chars(blockname):
    """Yield every codepoint in the Unicode block named *blockname*.

    The module-level ``blocks`` table is scanned in full; each entry whose
    name equals *blockname* contributes its inclusive ``start..end`` range.

    Raises:
        Exception: if no entry matched. Because this is a generator, the
            error surfaces only once the caller has iterated to the end.
    """
    matches = 0
    for first, last, label in blocks:
        if label != blockname:
            continue
        matches += 1
        yield from range(first, last + 1)
    if matches == 0:
        raise Exception("Unknown block: %s" % blockname)
def script_to_chars(scriptname):
    """Yield the codepoints associated with the Unicode script *scriptname*.

    *scriptname* is the script's long name as it appears in Scripts.txt and
    as a key of the module-level ``aliases`` mapping. Two sources are
    consulted, in this order:

    1. Scripts.txt ranges whose script equals *scriptname*.
    2. ScriptExtensions.txt ranges whose script field contains (``in``) the
       script's short alias.

    A codepoint present in both sources is yielded twice; callers that need
    uniqueness must deduplicate.

    Raises:
        ValueError: if *scriptname* has no entry in ``aliases``. Raised on
            first iteration, after both data files have been loaded.
    """
    script_ranges = youseedee.database["Scripts.txt"]["reader"]("Scripts.txt")
    extension_ranges = youseedee.database["ScriptExtensions.txt"]["reader"](
        "ScriptExtensions.txt"
    )

    shortname = aliases.get(scriptname)
    if shortname is None:
        raise ValueError(
            f"Are you sure {scriptname} is a Unicode script? It has no short alias"
        )

    # Primary assignment: ranges labelled with the long script name.
    for first, last, owner in script_ranges:
        if owner == scriptname:
            yield from range(first, last + 1)

    # Extensions: ranges whose script field mentions the short alias.
    for first, last, shared_with in extension_ranges:
        if shortname in shared_with:
            yield from range(first, last + 1)
def output_chars(todo, filename, args):
    """Write the resolved codepoint list for *filename* under OUTPUT_PATH.

    Args:
        todo: list of signed integers. Non-negative values are codepoints to
            include; a negative value ``-x`` removes codepoint ``x``.
        filename: path of the input file. Only its basename is used for the
            output name; the full string is echoed in the header comment.
        args: parsed command-line namespace; currently not read here.

    The codepoints in EVERYONE_GETS_THESE are added after exclusions, so they
    cannot be removed. Codepoints whose General_Category is "Cn", or that
    have neither a name nor an "Age" entry, are skipped.
    """
    wanted = {cp for cp in todo if cp >= 0}
    excluded = {-cp for cp in todo if cp < 0}
    final = (wanted - excluded).union(EVERYONE_GETS_THESE)

    destination = os.path.join(OUTPUT_PATH, os.path.basename(filename))
    with open(destination, "w") as out:
        out.writelines(
            [
                "# This file was generated by preprocess_namfile.py\n",
                "# It is a list of all codepoints in the Unicode block or script\n",
                f"# specified in the original file {filename}.\n\n",
            ]
        )
        for cp in sorted(final):
            info = youseedee.ucd_data(cp)
            label = info.get("Name", "")
            unassigned = info.get("General_Category") == "Cn"
            # Drop unassigned codepoints, and nameless ones with no "Age".
            if unassigned or (not label and "Age" not in info):
                continue
            out.write("0x%04X %s\n" % (cp, label))
def file_to_chars(filename, including=False):
    """Parse a glyphset definition file and yield signed codepoints.

    Each yielded integer is a codepoint; it is negated when the source line
    starts with ``-`` (an exclusion). Recognised line forms:

    * blank lines and lines starting with ``#`` are ignored; a trailing
      ``<whitespace>#...`` comment is removed;
    * ``@block(NAME)``, ``@script(NAME)`` expand via block_to_chars() /
      script_to_chars();
    * ``@include(PATH)`` parses PATH (relative to this file's directory)
      with ``including=True``;
    * ``START..END`` is an inclusive hexadecimal range;
    * a single hexadecimal codepoint, optionally followed by whitespace and
      free text.

    Hex values may carry a ``U+`` or ``0x`` prefix. A line containing
    ``@include_only`` stops parsing when the file is read directly
    (``including`` false) and is skipped when the file is being included.

    Raises:
        Exception: on a line matching none of the forms above.
    """
    directive_re = re.compile(r"^@(block|script|include)\(([^\)]+)\)$")
    range_re = re.compile(
        r"^(?:U\+|0x)?([0-9A-F]+)\s*\.\.\s*(?:U\+|0x)?([0-9A-F]+)$",
        re.IGNORECASE,
    )
    single_re = re.compile(r"^(?:U\+|0x)?([0-9A-F]+)(\s|$)", re.IGNORECASE)

    with open(filename, "r") as source:
        for raw in source:
            entry = raw.strip()

            if "@include_only" in entry:
                if including:
                    continue
                # Everything below the marker is for includers only.
                break

            if not entry or entry.startswith("#"):
                continue

            entry = re.sub(r"\s+#.*$", "", entry)

            sign = -1 if entry[0] == "-" else 1
            if sign < 0:
                entry = entry[1:]

            directive = directive_re.match(entry)
            if directive:
                kind, argument = directive.group(1), directive.group(2)
                if kind == "block":
                    members = block_to_chars(argument)
                elif kind == "script":
                    members = script_to_chars(argument)
                else:
                    nested = os.path.join(os.path.dirname(filename), argument)
                    members = file_to_chars(nested, including=True)
                # Materialise first so an expansion error surfaces before
                # any codepoint of this line is emitted.
                yield from [sign * cp for cp in members]
                continue

            span = range_re.match(entry)
            if span:
                low = int(span.group(1), 16)
                high = int(span.group(2), 16)
                yield from [sign * cp for cp in range(low, high + 1)]
                continue

            single = single_re.match(entry)
            if single is None:
                raise Exception("Invalid line: %s" % entry)
            yield sign * int(single.group(1), 16)
def main(args=None):
    """Command-line entry point: preprocess each glyphset file given.

    Args:
        args: optional list of argument strings to parse instead of
            ``sys.argv[1:]``. ``None`` (the default) keeps the normal
            command-line behaviour.

    For every input file the resolved codepoints are collected with
    file_to_chars() and, when non-empty, written out by output_chars().
    """
    parser = argparse.ArgumentParser(
        description="Preprocess a set of input glyphset files"
    )
    parser.add_argument(
        "--no-label",
        action="store_true",
        help="Don't emit a representative glyph, just the name",
    )
    parser.add_argument(
        "filenames", metavar="FILENAME", nargs="+", help="The input glyphset files"
    )
    # Pass the caller's argument list through; previously it was ignored and
    # sys.argv was always parsed, so main([...]) could not be driven
    # programmatically.
    args = parser.parse_args(args)
    for filename in args.filenames:
        print("Preprocessing " + filename)
        todo = list(file_to_chars(filename))
        if todo:
            output_chars(todo, filename, args)
        else:
            print(" No characters found")


if __name__ == "__main__":
    main()