import csv
import re
import sys
from argparse import ArgumentParser, Namespace
from dataclasses import dataclass
from itertools import takewhile
from pathlib import Path
from typing import Optional
from bs4 import BeautifulSoup, NavigableString
from jinja2 import Environment, FileSystemLoader, StrictUndefined
# Directory containing this script; every other path is resolved against it.
ROOT = Path(__file__).parent
# Folder holding the HTML input and the CSV derived from it.
DATA_DIR = ROOT.joinpath("data")
# HTML page read by the ``parse-html`` command.
INPUT_HTML = DATA_DIR.joinpath("marc-relators.html")
# CSV written by ``parse-html`` and read back by ``codegen``.
CSV_FILE = DATA_DIR.joinpath("marc-relators.csv")
# Rust source file written by ``codegen``.
OUTPUT_RS = ROOT.joinpath("src", "inner.rs")
@dataclass(frozen=True)
class MarcRelator:
    """One relator record, built from a row of the relator CSV.

    The computed properties expose the record in the forms handed to the
    code-generation template.
    """

    # Human-readable relator term (CSV column "name").
    name: str
    # Relator code (CSV column "code").
    code: str
    # Usage note (CSV column "note").
    note: str
    # Optional hand-written identifier (CSV column "name_override"); may be
    # None or empty when the row carries no override.
    name_override: Optional[str]

    @property
    def variant(self) -> str:
        """Return the identifier used for this relator.

        A non-blank ``name_override`` wins (with surrounding whitespace
        removed). Otherwise the identifier is derived from ``name`` by
        splitting on spaces and hyphens, capitalizing every piece and
        concatenating them, e.g. ``"Book designer"`` -> ``"BookDesigner"``.
        """
        # Strip *before* testing truthiness: a whitespace-only override would
        # otherwise be accepted and yield an empty identifier.
        override = (self.name_override or "").strip()
        if override:
            return override
        pieces = re.split(" |-", self.name)
        return "".join(piece.capitalize() for piece in pieces if piece).strip()

    @property
    def doc_str(self) -> str:
        """Return ``note`` with square brackets backslash-escaped.

        NOTE(review): presumably meant for doc comments, where unescaped
        brackets could be read as link syntax -- confirm against the template.
        """
        return self.note.replace("[", "\\[").replace("]", "\\]")

    @property
    def note_str(self) -> str:
        """Return ``note`` with backslashes and double quotes escaped.

        Backslashes are doubled first so that the backslashes introduced for
        the quotes are not escaped a second time.
        """
        return self.note.replace("\\", "\\\\").replace('"', '\\"')
def main() -> None:
    """Parse the command line and run the handler of the chosen sub-command.

    Each sub-parser stores its handler under ``func``; the handler is called
    with the parsed namespace as its only argument.
    """
    namespace = arg_parser().parse_args()
    handler = namespace.func
    handler(namespace)
def arg_parser() -> ArgumentParser:
    """Build the command-line parser for this script.

    Two sub-commands are registered, and each stores its handler in the
    ``func`` attribute of the parsed namespace:

    * ``parse-html`` -> ``parse_html``
    * ``codegen`` -> ``codegen``

    Returns:
        The configured parser; a sub-command is mandatory.
    """
    parser = ArgumentParser(Path(__file__).name, description="Codegen help")
    # ``dest`` is given explicitly: on interpreters that predate the
    # bpo-29298 fix, a required sub-parser group without a ``dest`` raises
    # TypeError instead of printing a usage error when no command is supplied.
    subs = parser.add_subparsers(dest="command", help="Commands to run", required=True)

    html = subs.add_parser("parse-html", help="Parse HTML to CSV")
    html.set_defaults(func=parse_html)

    code = subs.add_parser("codegen", help="Turn the CSV into Rust code")
    code.set_defaults(func=codegen)

    return parser
def codegen(args: Namespace) -> None:
    """Render the ``inner.rs`` template with the relators from the CSV file.

    Reads every row of ``CSV_FILE`` into a ``MarcRelator``, renders the
    ``inner.rs`` template found in the ``templates`` directory next to this
    script, strips each rendered line, drops blank lines and writes the
    result to ``OUTPUT_RS``.

    Args:
        args: Parsed command-line namespace (unused).
    """
    # The csv module asks for files opened with newline=""; UTF-8 is explicit
    # so the result does not depend on the platform's default encoding (the
    # HTML the CSV is derived from is read as UTF-8 as well).
    with open(CSV_FILE, newline="", encoding="utf-8") as f:
        relators = [
            # A short row has no "name_override" value; DictReader fills None.
            MarcRelator(row["name"], row["code"], row["note"], row.get("name_override"))
            for row in csv.DictReader(f)
        ]

    # Every Jinja delimiter is wrapped in "//" so the template's directives
    # sit behind "//" comment markers.
    env = Environment(
        loader=FileSystemLoader(Path(__file__).parent / "templates"),
        block_start_string="//{%",
        block_end_string="%}//",
        variable_start_string="//{{",
        variable_end_string="}}//",
        comment_start_string="//#",
        comment_end_string="#//",
        autoescape=False,
        trim_blocks=False,
        lstrip_blocks=False,
        keep_trailing_newline=True,
        # Fail loudly if the template references a name that was not passed.
        undefined=StrictUndefined,
    )
    template = env.get_template("inner.rs")
    rendered = template.render(relators=relators)

    # Remove indentation/trailing whitespace and discard lines left empty.
    stripped_lines = (line.strip() for line in rendered.split("\n"))
    kept_lines = [line for line in stripped_lines if line]

    with open(OUTPUT_RS, "w", encoding="utf-8") as f:
        f.write("\n".join(kept_lines))
def parse_html(args: Namespace) -> None:
    """Extract relator name, code and use-note from the HTML file into the CSV.

    The fourth ``<dl>`` of ``INPUT_HTML`` is walked child by child as a small
    state machine:

    * while a ``<dt>`` is expected, other elements are ignored; a ``<dt>``
      with a ``span.authorized`` supplies the name and code, while one
      without it causes the next element to be skipped;
    * the element after an accepted ``<dt>`` must carry the ``use-note``
      class, and its leading text becomes the note.

    The rows are written to ``CSV_FILE`` under a four-column header; the
    ``name_override`` column is never filled in here.

    Args:
        args: Parsed command-line namespace (unused).

    Raises:
        AssertionError: If the document does not have the expected shape.
        Exception: If the element following an accepted ``<dt>`` lacks the
            ``use-note`` class.
    """
    print("Using cached output HTML")
    with open(INPUT_HTML, "r", encoding="utf-8") as f:
        html_text = f.read()
    soup = BeautifulSoup(html_text, features="html5lib")
    dl_nodes = soup.find_all("dl")
    # Sanity checks on the page layout this parser was written against.
    assert len(dl_nodes) == 4, f"expected 4 <dl> elements, found {len(dl_nodes)}"
    parent = dl_nodes[3]
    assert len(parent.contents) > 1000, (
        f"relator list looks truncated: only {len(parent.contents)} child nodes"
    )
    print("Starting parsing")
    output = []
    skip_next_dd = False
    expecting_dt = True
    name = None
    code = None
    for node in parent.contents:
        # Text nodes directly inside the <dl> carry no data.
        if isinstance(node, NavigableString):
            continue
        if expecting_dt:
            if node.name != "dt":
                continue
            authorized = node.select("span.authorized")
            if not authorized:
                # No authorized term: drop this entry and the element after it.
                expecting_dt = False
                skip_next_dd = True
                continue
            name = authorized[0].string.strip()
            # The code is shown in square brackets; keep only the bare code.
            code = (
                node.select("span.relator-code")[0].string.replace("[", "").replace("]", "").strip()
            )
            expecting_dt = False
        elif not skip_next_dd:
            css_class = node.get("class")
            if css_class and "use-note" in css_class:
                if node.string:
                    string = node.string
                else:
                    # The note has nested markup: take its first leading text
                    # node only.
                    string = list(
                        takewhile(lambda x: isinstance(x, NavigableString), node.children)
                    )[0]
                output.append((name, code, string.strip()))
                expecting_dt = True
            else:
                raise Exception("Unknown CSS classes: " + str(css_class))
        else:
            # This is the element being skipped; go back to looking for a <dt>.
            skip_next_dd = False
            expecting_dt = True
    print("Writing output")
    # newline="" stops text mode from turning the csv module's "\r\n" row
    # terminator into "\r\r\n" on Windows; UTF-8 matches how the HTML was read.
    with open(CSV_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["name", "code", "note", "name_override"])
        writer.writerows(output)
    print("Done")
# Script entry point: run the CLI; on Ctrl-C print an empty line and exit
# with status 1.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("")
        raise SystemExit(1)