prek-identify 0.3.8

File identification for prek
Documentation
# /// script
# requires-python = ">=3.14"
# dependencies = [
#     "identify>=2.6.16",
# ]
# ///
from pathlib import Path

from identify.identify import ALL_TAGS
from identify.interpreters import INTERPRETERS
from identify.extensions import EXTENSIONS, EXTENSIONS_NEED_BINARY_CHECK, NAMES


# (Rust constant name, tag) pairs. For each pair gen() emits
# `pub const <name>: u16 = <id>;`, where <id> is the tag's index in the
# sorted, de-duplicated ALL_TAGS list. Every tag listed here must be present
# in ALL_TAGS, otherwise gen() fails with a KeyError on the id lookup.
TAG_ID_CONSTS = [
    ("TAG_FILE", "file"),
    ("TAG_DIRECTORY", "directory"),
    ("TAG_SYMLINK", "symlink"),
    ("TAG_SOCKET", "socket"),
    ("TAG_EXECUTABLE", "executable"),
    ("TAG_NON_EXECUTABLE", "non-executable"),
    ("TAG_TEXT", "text"),
    ("TAG_BINARY", "binary"),
]

# (Rust constant name, list of tags) pairs. For each pair gen() emits
# `pub const <name>: TagSet = TagSet::new(&[<ids>]);`, where <ids> are the
# numeric ids of the listed tags in ascending order. As with TAG_ID_CONSTS,
# every tag must be present in ALL_TAGS or gen() fails with a KeyError.
TAG_SET_CONSTS = [
    ("TAG_SET_FILE", ["file"]),
    ("TAG_SET_DIRECTORY", ["directory"]),
    ("TAG_SET_SYMLINK", ["symlink"]),
    ("TAG_SET_SOCKET", ["socket"]),
    ("TAG_SET_TEXT", ["text"]),
    ("TAG_SET_TEXT_OR_BINARY", ["text", "binary"]),
    ("TAG_SET_EXECUTABLE_TEXT", ["executable", "text"]),
    ("TAG_SET_JSON", ["json"]),
    ("TAG_SET_JSON5", ["json5"]),
    ("TAG_SET_TOML", ["toml"]),
    ("TAG_SET_XML", ["xml"]),
    ("TAG_SET_YAML", ["yaml"]),
]

# Directory that contains this script; the output path is anchored here so
# the script writes to the same place regardless of the working directory.
SELF_DIR = Path(__file__).parent
# Rust source file that gen() overwrites on every run.
TAGS_FILE = SELF_DIR.joinpath("src", "tags.rs")


def _render_phf_map(const_name, mapping, tagset_expr):
    """Render one ``phf::Map<&str, TagSet>`` constant as Rust source text.

    Args:
        const_name: Name of the Rust constant to declare.
        mapping: Mapping from a string key to an iterable of tag names.
        tagset_expr: Callable that turns an iterable of tag names into a
            Rust ``TagSet::new(...)`` expression.

    Returns:
        The Rust source for the map, terminated by ``};`` and a newline.
        Entries appear in sorted key order so the output is deterministic.
    """
    parts = [f"pub const {const_name}: phf::Map<&str, TagSet> = phf::phf_map! {{\n"]
    for key in sorted(mapping):
        tag_set = mapping[key]
        tag_names_str = ", ".join(f'"{tag}"' for tag in sorted(tag_set))
        # The TagSet expression only carries numeric ids, so spell out the
        # tag names in a comment to keep the generated file readable.
        parts.append(f"    // [{tag_names_str}]\n")
        parts.append(f'    "{key}" => {tagset_expr(tag_set)},\n')
    parts.append("};\n")
    return "".join(parts)


def gen():
    """Regenerate ``TAGS_FILE`` from the tables exported by ``identify``.

    The generated Rust file contains, in order: the ``ALL_TAGS`` array, one
    ``u16`` id constant per entry of ``TAG_ID_CONSTS``, one ``TagSet``
    constant per entry of ``TAG_SET_CONSTS``, and the ``INTERPRETERS``,
    ``EXTENSIONS`` and ``NAMES`` ``phf`` maps. A tag's id is its index in the
    sorted, de-duplicated ``ALL_TAGS`` list.

    The whole file is rendered in memory before it is opened for writing, so
    a failure while rendering does not leave a truncated file behind.

    Raises:
        KeyError: If a tag referenced by any table is not in ``ALL_TAGS``.
    """
    tags = sorted(set(ALL_TAGS))
    tag_to_id = {tag: idx for idx, tag in enumerate(tags)}

    def tagset_expr(tag_set):
        # Ids are sorted so the same tag set always renders identically.
        ids = sorted(tag_to_id[tag] for tag in tag_set)
        ids_str = ", ".join(str(tag_id) for tag_id in ids)
        return f"TagSet::new(&[{ids_str}])"

    # Merge into a fresh dict rather than calling EXTENSIONS.update(), which
    # would mutate the table owned by the imported ``identify`` module.
    # Entries from EXTENSIONS_NEED_BINARY_CHECK win on key collisions.
    extensions = {**EXTENSIONS, **EXTENSIONS_NEED_BINARY_CHECK}

    # Each section ends with a newline; joining with "\n" then yields exactly
    # one blank line between sections and none after the last one.
    sections = [
        "// This file is auto-generated by gen.py. DO NOT EDIT MANUALLY.\n",
        "use crate::TagSet;\n",
        f"pub const ALL_TAGS: [&str; {len(tags)}] = [\n"
        + "".join(f'    "{tag}",\n' for tag in tags)
        + "];\n",
        "".join(
            f"pub const {const_name}: u16 = {tag_to_id[tag]};\n"
            for const_name, tag in TAG_ID_CONSTS
        ),
        "".join(
            f"pub const {const_name}: TagSet = {tagset_expr(const_tags)};\n"
            for const_name, const_tags in TAG_SET_CONSTS
        ),
        _render_phf_map("INTERPRETERS", INTERPRETERS, tagset_expr),
        _render_phf_map("EXTENSIONS", extensions, tagset_expr),
        _render_phf_map("NAMES", NAMES, tagset_expr),
    ]

    # Explicit UTF-8 (Rust sources must be UTF-8) and "\n" line endings keep
    # the output identical across platforms and locales.
    with open(TAGS_FILE, "w", encoding="utf-8", newline="\n") as f:
        f.write("\n".join(sections))


def main() -> None:
    """Script entry point: regenerate the tags file by calling gen()."""
    gen()


# Run the generator only when this file is executed as a script, so that
# importing the module does not overwrite the generated file.
if __name__ == "__main__":
    main()