from __future__ import annotations
import argparse
import json
import re
import sys
from pathlib import Path
# Checkbox characters that map to a non-default status. Any character not
# listed here (including a blank) is treated as "pending".
_STATUS_BY_MARK = {
    "x": "reviewed",
    "X": "reviewed",  # task lists commonly accept an uppercase X as "checked"
    "!": "broken",
    "→": "reviewed",
    ">": "reviewed",
}


def parse_md_line(line: str) -> dict | None:
    """Parse one markdown checklist line into a review-queue record.

    The expected shape is roughly::

        - [x] [Title](https://example.com) — Source name <!-- optional note -->

    Args:
        line: A single line of markdown; surrounding whitespace is ignored.

    Returns:
        ``None`` when the line is not a ``- [?]`` checklist item or contains
        no ``[title](url)`` link. Otherwise a dict with the keys ``url``,
        ``title``, ``source``, ``status``, ``discovered``, ``reviewed_date``,
        ``artifacts_found``, ``heuristics_found`` and ``notes``. The four
        fields that cannot be derived from the markdown are set to ``None``.
    """
    line = line.strip()

    # The anchored pattern also rejects lines that do not start with "- [",
    # so no separate prefix check is needed.
    status_m = re.match(r"^- \[(.)\]", line)
    if not status_m:
        return None
    status = _STATUS_BY_MARK.get(status_m.group(1), "pending")

    # First markdown link on the line: [title](url).
    link_m = re.search(r"\[([^\]]*)\]\(([^)]+)\)", line)
    if not link_m:
        return None
    title = link_m.group(1).strip()
    url = link_m.group(2).strip()

    # Source is the text after ") — ", up to an HTML comment or end of line.
    src_m = re.search(r"\)\s*—\s*([^<\n\[]+?)(?:\s*<!|\s*$)", line)
    source = src_m.group(1).strip() if src_m else ""

    # Every HTML comment on the line contributes to the notes, space-joined.
    notes_parts = re.findall(r"<!--(.*?)-->", line, re.DOTALL)
    notes = " ".join(part.strip() for part in notes_parts).strip()

    return {
        "url": url,
        "title": title,
        "source": source,
        "status": status,
        "discovered": None,
        "reviewed_date": None,
        "artifacts_found": None,
        "heuristics_found": None,
        "notes": notes,
    }
def load_existing_jsonl(path: Path) -> dict[str, dict]:
    """Load a JSONL file into a dict keyed by each record's ``"url"`` value.

    Blank lines are ignored. Lines that are not valid JSON, or that decode to
    something other than a JSON object, are skipped with a warning on stderr.
    Objects without a ``"url"`` key are silently skipped. When the same URL
    occurs more than once, the last record wins.

    Args:
        path: Location of the JSONL file; it does not have to exist.

    Returns:
        Mapping of URL to the decoded record, or an empty dict when *path*
        does not exist.
    """
    if not path.exists():
        return {}

    entries: dict[str, dict] = {}
    with open(path, encoding="utf-8") as fh:
        for lineno, raw in enumerate(fh, 1):
            text = raw.strip()
            if not text:
                continue
            # Keep the try body to the one call that can raise a decode error.
            try:
                entry = json.loads(text)
            except json.JSONDecodeError as exc:
                print(f"WARNING: skipping malformed JSONL line {lineno}: {exc}", file=sys.stderr)
                continue
            # A bare number, string, list, etc. is valid JSON but not a record;
            # probing it for "url" would raise TypeError or misbehave.
            if not isinstance(entry, dict):
                print(f"WARNING: skipping non-object JSONL line {lineno}", file=sys.stderr)
                continue
            if "url" in entry:
                entries[entry["url"]] = entry
    return entries
def _ends_without_newline(path: Path) -> bool:
    """Return True when *path* exists, is non-empty, and its last byte is not a newline."""
    try:
        with open(path, "rb") as fh:
            fh.seek(0, 2)  # whence=2: seek relative to end of file
            if fh.tell() == 0:
                return False
            fh.seek(-1, 2)
            return fh.read(1) != b"\n"
    except FileNotFoundError:
        return False


def main(argv: list[str] | None = None) -> int:
    """Sync checklist entries from a markdown file into a JSONL file.

    Entries are keyed by URL. URLs already present in the JSONL file are left
    untouched unless ``--overwrite-notes`` is given, in which case a non-empty
    markdown note replaces a differing JSONL note. URLs not yet present are
    added once, even if the markdown lists them several times (the first
    occurrence wins).

    When notes were changed the JSONL file is rewritten in full; otherwise new
    entries are appended. NOTE: a full rewrite only writes the records that
    ``load_existing_jsonl`` returned plus the new ones, so any lines that the
    loader skipped are not carried over.

    Args:
        argv: Command-line arguments excluding the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        Process exit status: 1 when the markdown file is missing, else 0.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--md", default="archive/sources/pending-review.md", help="source markdown file")
    parser.add_argument("--jsonl", default="archive/sources/pending-review.jsonl", help="output JSONL file")
    parser.add_argument("--dry-run", action="store_true", help="print what would be written, don't write")
    parser.add_argument("--overwrite-notes", action="store_true",
                        help="update notes for already-present entries (default: preserve existing JSONL entry)")
    args = parser.parse_args(argv)

    md_path = Path(args.md)
    jsonl_path = Path(args.jsonl)
    if not md_path.exists():
        print(f"ERROR: markdown file not found: {md_path}", file=sys.stderr)
        return 1

    existing = load_existing_jsonl(jsonl_path)
    print(f"Existing JSONL entries: {len(existing)}")

    with open(md_path, encoding="utf-8") as fh:
        md_entries = [entry for entry in map(parse_md_line, fh) if entry is not None]
    print(f"Parsed {len(md_entries)} entries from {md_path}")

    new_entries: list[dict] = []
    new_urls: set[str] = set()       # URLs already queued in new_entries
    updated_urls: set[str] = set()   # existing URLs whose notes actually changed
    skipped = 0
    duplicates = 0
    for entry in md_entries:
        url = entry["url"]
        if url in existing:
            skipped += 1
            notes = entry["notes"]
            # Only count (and later rewrite for) notes that really differ.
            if args.overwrite_notes and notes and existing[url].get("notes") != notes:
                existing[url]["notes"] = notes
                updated_urls.add(url)
        elif url in new_urls:
            # Same URL listed more than once in the markdown: keep the first.
            duplicates += 1
        else:
            new_urls.add(url)
            new_entries.append(entry)

    print(f"New entries to add: {len(new_entries)}")
    print(f"Already-present (skipped): {skipped}")
    if duplicates:
        print(f"Duplicate URLs in markdown (ignored): {duplicates}")
    if args.overwrite_notes:
        print(f"Notes updated: {len(updated_urls)}")

    if args.dry_run:
        print("\n--- DRY RUN: first 5 new entries ---")
        for entry in new_entries[:5]:
            print(json.dumps(entry, ensure_ascii=False))
        if len(new_entries) > 5:
            print(f"... and {len(new_entries) - 5} more")
        return 0

    jsonl_path.parent.mkdir(parents=True, exist_ok=True)
    if updated_urls:
        # Existing records were modified in memory, so the file must be
        # rewritten rather than appended to.
        all_entries = [*existing.values(), *new_entries]
        with open(jsonl_path, "w", encoding="utf-8") as fh:
            for entry in all_entries:
                fh.write(json.dumps(entry, ensure_ascii=False) + "\n")
        print(f"Rewrote {jsonl_path} with {len(all_entries)} entries")
    else:
        # Guard against gluing the first new record onto an unterminated
        # last line of the existing file.
        needs_separator = bool(new_entries) and _ends_without_newline(jsonl_path)
        with open(jsonl_path, "a", encoding="utf-8") as fh:
            if needs_separator:
                fh.write("\n")
            for entry in new_entries:
                fh.write(json.dumps(entry, ensure_ascii=False) + "\n")
        total = len(existing) + len(new_entries)
        print(f"Appended {len(new_entries)} entries to {jsonl_path} (total: {total})")
    return 0
# Script entry point: run main() and use its return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())