# google-fonts-subsets 0.202602.1

import youseedee
import os
import re
import argparse
import csv

# Codepoints added to every generated subset, whatever the input file says:
# U+0000 NULL, U+000D CARRIAGE RETURN, U+0020 SPACE, U+00A0 NO-BREAK SPACE.
EVERYONE_GETS_THESE = [0x00, 0x0D, 0x20, 0xA0]
# Directory the generated files are written to (relative to the current
# working directory).
OUTPUT_PATH = "Lib/gfsubsets/data/"

# Parsed Blocks.txt; consumed by block_to_chars() as (start, end, name) entries.
blocks = youseedee.database["Blocks.txt"]["reader"]("Blocks.txt")

# Maps a script's long name (e.g. "Latin") to its short alias (e.g. "Latn"),
# taken from the "sc" rows of PropertyValueAliases.txt.
aliases = {}
with open(
    os.path.join(youseedee.ucd_dir(), "PropertyValueAliases.txt"),
    "r",
    newline="",
    encoding="utf-8",
) as f:
    reader = csv.reader(f, delimiter=";", skipinitialspace=True)
    for row in reader:
        # Comment and blank lines produce fewer than three fields.  Rows with
        # MORE than three fields must be kept: some scripts list an additional
        # alias in a fourth column (e.g. "sc ; Copt ; Coptic ; Qaac"), and
        # requiring exactly three fields silently dropped those scripts.
        if len(row) < 3:
            continue
        category, short, long_name = (field.strip() for field in row[:3])
        if category != "sc":
            continue
        aliases[long_name] = short


def block_to_chars(blockname):
    """Yield every codepoint of the Unicode block called *blockname*.

    The module-level ``blocks`` table is scanned for entries whose name equals
    *blockname*; each matching entry contributes its whole inclusive
    ``start..end`` range.

    Raises:
        Exception: if no entry in ``blocks`` carries that name.
    """
    spans = [(first, last) for first, last, label in blocks if label == blockname]
    if not spans:
        raise Exception("Unknown block: %s" % blockname)
    for first, last in spans:
        yield from range(first, last + 1)


def script_to_chars(scriptname):
    """Yield the codepoints associated with the Unicode script *scriptname*.

    Two sources are combined, in this order:

    1. ranges in Scripts.txt whose value equals *scriptname* (the long name);
    2. ranges in ScriptExtensions.txt whose value contains the script's short
       alias, looked up in the module-level ``aliases`` table.

    A codepoint can be produced more than once when both sources list it.

    Raises:
        ValueError: if *scriptname* has no entry in ``aliases``.
    """
    script_ranges = youseedee.database["Scripts.txt"]["reader"]("Scripts.txt")
    extension_ranges = youseedee.database["ScriptExtensions.txt"]["reader"](
        "ScriptExtensions.txt"
    )

    shortname = aliases.get(scriptname)
    if shortname is None:
        raise ValueError(
            f"Are you sure {scriptname} is a Unicode script? It has no short alias"
        )

    # Primary assignment: Scripts.txt is keyed by the long script name.
    for first, last, owner in script_ranges:
        if owner != scriptname:
            continue
        yield from range(first, last + 1)

    # Extensions: ScriptExtensions.txt is matched against the short alias.
    for first, last, extension_codes in extension_ranges:
        if shortname in extension_codes:
            yield from range(first, last + 1)


def output_chars(todo, filename, args):
    """Write the resolved codepoint list for *filename* into ``OUTPUT_PATH``.

    Args:
        todo: signed integers; a value ``x >= 0`` adds codepoint ``x`` and a
            value ``x < 0`` removes codepoint ``-x``.  Removals win over
            additions, but ``EVERYONE_GETS_THESE`` is always added back.
        filename: path of the input file; only its basename is used for the
            output file, while the full value is quoted in the header comment.
        args: parsed command-line namespace; not used by this function.

    Codepoints whose General_Category is "Cn", or which have neither a name
    nor an "Age" entry in the character data, are left out of the output.
    """
    included = {value for value in todo if value >= 0}
    excluded = {-value for value in todo if value < 0}
    codepoints = (included - excluded) | set(EVERYONE_GETS_THESE)

    destination = os.path.join(OUTPUT_PATH, os.path.basename(filename))
    with open(destination, "w") as f:
        f.writelines(
            [
                "# This file was generated by preprocess_namfile.py\n",
                "# It is a list of all codepoints in the Unicode block or script\n",
                f"# specified in the original file {filename}.\n\n",
            ]
        )
        for codepoint in sorted(codepoints):
            data = youseedee.ucd_data(codepoint)
            name = data.get("Name", "")
            # Skip unassigned codepoints ...
            if data.get("General_Category") == "Cn":
                continue
            # ... and nameless ones that also lack an Age entry.
            if not name and "Age" not in data:
                continue
            f.write(f"0x{codepoint:04X} {name}\n")


# "@block(NAME)", "@script(NAME)" or "@include(FILE)".
_DIRECTIVE_RE = re.compile(r"^@(block|script|include)\(([^\)]+)\)$")
# "START..END", each side optionally prefixed with "U+" or "0x".
_RANGE_RE = re.compile(
    r"^(?:U\+|0x)?([0-9A-F]+)\s*\.\.\s*(?:U\+|0x)?([0-9A-F]+)$", re.IGNORECASE
)
# A single hex codepoint, optionally followed by whitespace and a free-form label.
_CODEPOINT_RE = re.compile(r"^(?:U\+|0x)?([0-9A-F]+)(\s|$)", re.IGNORECASE)
# A "#" comment that follows some whitespace at the end of a line.
_TRAILING_COMMENT_RE = re.compile(r"\s+#.*$")


def file_to_chars(filename, including=False):
    """Yield the signed codepoints described by the glyphset file *filename*.

    Each line is one of: blank, a ``#`` comment, a single codepoint, a
    ``START..END`` range, or an ``@block(...)``, ``@script(...)`` or
    ``@include(...)`` directive.  A leading ``-`` on a line negates every
    value that line produces (so codepoint 0 cannot be distinguished when
    negated).  Include paths are resolved relative to the directory of
    *filename*.

    A line containing ``@include_only`` splits the file: when the file is read
    directly (``including=False``) reading stops there; when it is read via
    ``@include`` the marker is skipped and reading continues.

    Args:
        filename: path of the file to read; it is decoded as UTF-8.
        including: True when the file is being read through an ``@include``.

    Raises:
        Exception: for a line that matches none of the accepted forms.
    """
    # Glyphset files may carry non-ASCII label glyphs after the codepoint, so
    # decode as UTF-8 explicitly instead of relying on the platform default.
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if "@include_only" in line:
                if not including:
                    break
                continue
            if not line or line.startswith("#"):
                continue
            # The line is non-empty and does not start with "#", so stripping
            # a trailing comment always leaves at least one character.
            line = _TRAILING_COMMENT_RE.sub("", line)
            sign = 1
            if line[0] == "-":
                line = line[1:]
                sign = -1

            m = _DIRECTIVE_RE.match(line)
            if m:
                directive, argument = m.group(1), m.group(2)
                if directive == "block":
                    source = block_to_chars(argument)
                elif directive == "script":
                    source = script_to_chars(argument)
                else:
                    includefile = os.path.join(os.path.dirname(filename), argument)
                    source = file_to_chars(includefile, including=True)
                yield from (sign * x for x in source)
                continue

            m = _RANGE_RE.match(line)
            if m:
                first, last = int(m.group(1), 16), int(m.group(2), 16)
                yield from (sign * x for x in range(first, last + 1))
                continue

            m = _CODEPOINT_RE.match(line)
            if m:
                yield sign * int(m.group(1), 16)
                continue

            raise Exception("Invalid line: %s" % line)


def main(args=None):
    """Command-line entry point: preprocess each glyphset file given.

    Args:
        args: list of argument strings to parse.  ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    For every FILENAME the codepoints are collected with ``file_to_chars`` and,
    if there are any, written out with ``output_chars``.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess a set of input glyphset files"
    )
    parser.add_argument(
        "--no-label",
        action="store_true",
        help="Don't emit a representative glyph, just the name",
    )
    parser.add_argument(
        "filenames", metavar="FILENAME", nargs="+", help="The input glyphset files"
    )
    # Pass the caller's argument list through; previously it was ignored and
    # sys.argv was always parsed, making main([...]) unusable programmatically.
    args = parser.parse_args(args)
    for filename in args.filenames:
        print("Preprocessing " + filename)
        todo = list(file_to_chars(filename))
        if todo:
            output_chars(todo, filename, args)
        else:
            print(" No characters found")


if __name__ == "__main__":
    main()