marc-relators 0.1.1

Parsing and de/serialization for MARC relators
Documentation
#!/usr/bin/env python3
import csv
import re
import sys
from argparse import ArgumentParser, Namespace
from dataclasses import dataclass
from itertools import takewhile
from pathlib import Path
from typing import Optional

from bs4 import BeautifulSoup, NavigableString
from jinja2 import Environment, FileSystemLoader, StrictUndefined

# All paths are resolved relative to this script's own directory.
ROOT = Path(__file__).parent
DATA_DIR = ROOT / "data"
INPUT_HTML = DATA_DIR / "marc-relators.html"  # cached source HTML (see parse_html)
CSV_FILE = DATA_DIR / "marc-relators.csv"  # intermediate parsed data
OUTPUT_RS = ROOT / "src" / "inner.rs"  # generated Rust output (see codegen)


@dataclass(frozen=True)
class MarcRelator:
    """One MARC relator entry as read from the intermediate CSV.

    Fields mirror the CSV columns; `name_override`, when set, replaces the
    automatically derived Rust enum variant name.
    """

    name: str  # human-readable relator name
    code: str  # relator code (e.g. "aut")
    note: str  # usage note text
    name_override: Optional[str]  # manual variant name, or None/empty to derive one

    @property
    def variant(self) -> str:
        """Rust enum variant name: the override if present, else CamelCased `name`."""
        override = self.name_override
        if override:
            return override.strip()
        words = (w for w in re.split(" |-", self.name) if w)
        return "".join(map(str.capitalize, words)).strip()

    @property
    def doc_str(self) -> str:
        """The note with square brackets escaped (for a Rust doc comment)."""
        escaped = self.note.replace("[", "\\[")
        return escaped.replace("]", "\\]")

    @property
    def note_str(self) -> str:
        """The note with backslashes and quotes escaped (for a Rust string literal)."""
        escaped = self.note.replace("\\", "\\\\")
        return escaped.replace('"', '\\"')


def main() -> None:
    """Parse the command line and dispatch to the chosen subcommand."""
    parsed = arg_parser().parse_args()
    parsed.func(parsed)


def arg_parser() -> ArgumentParser:
    """Build the CLI parser with one subcommand per codegen step.

    `dest` is set on the subparsers group: without it, a required subparsers
    group raises a bare TypeError instead of a proper usage error when no
    subcommand is given on Python < 3.9 (argparse bug bpo-29298). The chosen
    subcommand name is additionally exposed as `args.command`.
    """
    parser = ArgumentParser(Path(__file__).name, description="Codegen help")
    subs = parser.add_subparsers(dest="command", help="Commands to run", required=True)

    html = subs.add_parser("parse-html", help="Parse HTML to CSV")
    html.set_defaults(func=parse_html)

    code = subs.add_parser("codegen", help="Turn the CSV into Rust code")
    code.set_defaults(func=codegen)

    return parser


def codegen(args: Namespace) -> None:
    """Read the relator CSV and render it into Rust source via the Jinja template."""
    with open(CSV_FILE) as csv_file:
        relators = [
            MarcRelator(row["name"], row["code"], row["note"], row.get("name_override"))
            for row in csv.DictReader(csv_file)
        ]

    env = Environment(
        loader=FileSystemLoader(Path(__file__).parent / "templates"),
        block_start_string="//{%",
        block_end_string="%}//",
        variable_start_string="//{{",
        variable_end_string="}}//",
        comment_start_string="//#",
        comment_end_string="#//",
        autoescape=False,
        trim_blocks=False,
        lstrip_blocks=False,
        keep_trailing_newline=True,
        undefined=StrictUndefined,
    )
    rendered = env.get_template("inner.rs").render(relators=relators)

    # `cargo fmt` seems to be upset at the very long lines, so strip each
    # line and drop the blank ones before writing.
    kept = [stripped for stripped in (ln.strip() for ln in rendered.split("\n")) if stripped]

    with open(OUTPUT_RS, "w") as out_file:
        out_file.write("\n".join(kept))


def parse_html(args: Namespace) -> None:
    """Parse the cached relator-list HTML into the intermediate CSV.

    Walks the last <dl> in the page as a <dt>/<dd> stream: each <dt> holding
    a span.authorized (name) and span.relator-code is paired with the
    following <dd class="use-note"> (note text). A <dt> without an authorized
    span — presumably a cross-reference entry; confirm against the source
    page — is skipped along with its <dd>. Writes one CSV row per authorized
    term; the name_override column is left empty.
    """
    # You'd think we could use `requests` for this, but it chokes on the encoding, so no
    print("Using cached output HTML")
    with open(INPUT_HTML, "r", encoding="utf-8") as f:
        html_text = f.read()

    soup = BeautifulSoup(html_text, features="html5lib")

    dl_nodes = soup.find_all("dl")
    # there are currently four of these, and we want the last one
    assert len(dl_nodes) == 4

    parent = dl_nodes[3]
    # there's a bunch of stuff, make sure we grabbed (probably) the right one
    assert len(parent.contents) > 1000

    print("Starting parsing")

    output = []
    # Small state machine over the node stream:
    skip_next_dd = False  # current entry is unauthorized; drop its <dd> too
    expecting_dt = True  # next element node should be a <dt>, not a <dd>
    name = None  # carried from the <dt> to its matching <dd>
    code = None

    for node in parent.contents:
        # bare text between tags is not part of the <dt>/<dd> structure
        if isinstance(node, NavigableString):
            continue

        if expecting_dt:
            if node.name != "dt":
                continue
            authorized = node.select("span.authorized")
            if not authorized:
                # no authorized name: remember to also skip the note that follows
                expecting_dt = False
                skip_next_dd = True
                continue
            name = authorized[0].string.strip()
            # the code is wrapped in literal square brackets; strip them off
            code = (
                node.select("span.relator-code")[0].string.replace("[", "").replace("]", "").strip()
            )
            expecting_dt = False
        elif not skip_next_dd:
            css_class = node.get("class")
            if css_class and "use-note" in css_class:
                if node.string:
                    string = node.string
                else:
                    # note contains nested markup; take the leading plain-text run
                    string = list(
                        takewhile(lambda x: isinstance(x, NavigableString), node.children)
                    )[0]
                output.append((name, code, string.strip()))
                expecting_dt = True
            else:
                raise Exception("Unknown CSS classes: " + str(css_class))
        else:
            # this is the <dd> of a skipped entry; discard it and resume
            skip_next_dd = False
            expecting_dt = True

    print("Writing output")
    with open(CSV_FILE, "w") as f:
        writer = csv.writer(f)
        # header has a 4th column (name_override) that the rows leave empty
        writer.writerow(["name", "code", "note", "name_override"])
        for row in output:
            writer.writerow(row)

    print("Done")


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        # on Ctrl-C, exit quietly (no traceback) but with a failure code
        print("")
        sys.exit(1)