from __future__ import annotations
import argparse
import re
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
# Paths: the script is expected to live in <repo>/scripts/, so the repository
# root is two parent levels up from this file.
REPO_ROOT = Path(__file__).resolve().parent.parent
# Markdown document containing the <!-- BENCH:<id>:BEGIN/END --> marker blocks.
BENCH_FILE = REPO_ROOT / "BENCHMARKS.md"
# Base `cargo bench` invocation; run_bench() appends "-- --quick" when asked.
CARGO_BASE = ["cargo", "bench", "--bench", "compare", "--features", "serde_json"]
# Library labels exactly as they appear in the criterion benchmark IDs
# (the third "/"-separated component), in desired table-row order.
PARSE_LIBS = [
"datavalue (reused arena)",
"serde_json::Value",
"simd_json (borrowed)",
"simd_json (owned)",
"sonic_rs::Value",
"json-rust",
]
# Display-name overrides for the parse table; labels not listed here are
# rendered as-is.
PARSE_DISPLAY = {"datavalue (reused arena)": "datavalue"}
SERIALIZE_LIBS = [
"datavalue",
"serde_json::Value",
"simd_json (owned)",
"sonic_rs::Value",
"json-rust",
]
# The access table covers the same libraries as the serialize table.
ACCESS_LIBS = SERIALIZE_LIBS
MUTATE_LIBS = [
"OwnedDataValue (clone + mutate)",
"serde_json (clone + mutate)",
"simd_json (clone + mutate)",
"json-rust (clone + mutate)",
]
# Fixture names (the second "/"-separated benchmark-ID component), in
# table-column order.
FIXTURES = ["twitter", "citm", "canada"]
# Column headers for the parse table: fixture plus input size.
PARSE_HEADERS = {
"twitter": "twitter (631 KB)",
"citm": "citm (1.65 MB)",
"canada": "canada (2.15 MB)",
}
# Column headers for the access table: fixture plus the JSON paths accessed.
ACCESS_HEADERS = {
"twitter": "twitter (statuses[].user, retweet_count)",
"citm": "citm (events.* id + subTopicIds)",
"canada": "canada (features[].coordinates[])",
}
def run_bench(quick: bool) -> str:
    """Run the criterion comparison bench and return its combined output.

    The command is echoed to stderr, stdout/stderr are captured together,
    and the last 2000 characters are mirrored to stderr for progress
    visibility. Exits with cargo's return code on failure.
    """
    command = list(CARGO_BASE) + (["--", "--quick"] if quick else [])
    print(f"$ {' '.join(command)}", file=sys.stderr)
    completed = subprocess.run(
        command,
        cwd=REPO_ROOT,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave so criterion lines stay ordered
        text=True,
    )
    sys.stderr.write(completed.stdout[-2000:])
    if completed.returncode != 0:
        sys.exit(completed.returncode)
    return completed.stdout
# Benchmark ID line: "<group>/<fixture>/<lib>", optionally followed inline by
# "time: [...]" (criterion sometimes puts the timing on the same line).
NAME_RE = re.compile(r"^(parse|serialize|access|mutate)/([^/]+)/(.+?)(\s+time:.*)?$")
# The "[lo unit mid unit hi unit]" bracket criterion prints for time/thrpt.
BRACKET_RE = re.compile(r"\[([^\]]+)\]")


def _median(bracket_body: str) -> str | None:
    """Return the middle "value unit" pair from a criterion bracket body."""
    tokens = bracket_body.split()
    return f"{tokens[2]} {tokens[3]}" if len(tokens) >= 6 else None


def parse_log(text: str) -> dict[tuple[str, str, str], dict[str, str]]:
    """Parse criterion console output into median stats per benchmark.

    Returns a mapping of (group, fixture, lib) -> {"time": ..., "thrpt": ...}
    where each value is the median "value unit" string; either key may be
    absent when the log did not contain it.
    """
    out: dict[tuple[str, str, str], dict[str, str]] = {}
    key: tuple[str, str, str] | None = None
    for raw_line in text.splitlines():
        line = raw_line.rstrip()
        # Progress lines repeat the benchmark ID; ignore them.
        if line.startswith("Benchmarking"):
            continue
        body = line.strip()
        if body.startswith(("time:", "thrpt:")):
            # Stat line for the most recently seen benchmark ID.
            if key is None:
                continue
            stat = "time" if body.startswith("time:") else "thrpt"
            bracket = BRACKET_RE.search(body)
            if bracket is not None:
                med = _median(bracket.group(1))
                if med:
                    out.setdefault(key, {})[stat] = med
            continue
        match = NAME_RE.match(line)
        if match is None:
            continue
        key = (match.group(1), match.group(2), match.group(3).strip())
        out.setdefault(key, {})
        # Handle the inline "name  time: [...]" form in one pass.
        trailer = match.group(4)
        if trailer:
            bracket = BRACKET_RE.search(trailer)
            if bracket is not None:
                med = _median(bracket.group(1))
                if med:
                    out[key]["time"] = med
    return out
# Multipliers from criterion time units to nanoseconds.
TIME_NS = {"ns": 1.0, "µs": 1e3, "us": 1e3, "ms": 1e6, "s": 1e9}
# Multipliers from criterion throughput units to bytes per second.
SIZE_BPS = {
    "B/s": 1.0,
    "KiB/s": 1024.0,
    "MiB/s": 1024.0**2,
    "GiB/s": 1024.0**3,
    "TiB/s": 1024.0**4,
}


def _to_float(s: str | None, table: dict[str, float]) -> float | None:
    """Convert a "<value> <unit>" string to a float using *table*.

    Returns None for missing input, unknown units, or unparsable numbers.
    """
    if not s:
        return None
    tokens = s.split()
    if len(tokens) != 2:
        return None
    value, unit = tokens
    if unit not in table:
        return None
    try:
        return float(value) * table[unit]
    except ValueError:
        return None


def time_ns(s: str | None) -> float | None:
    """Parse a criterion time string (e.g. "1.23 ms") into nanoseconds."""
    return _to_float(s, TIME_NS)


def thrpt_bps(s: str | None) -> float | None:
    """Parse a criterion throughput string (e.g. "2.5 GiB/s") into bytes/sec."""
    return _to_float(s, SIZE_BPS)
def _best(results, group, fixture, libs, key, *, lower_is_better):
    """Return the library with the best median value for one table column.

    *key* selects the stat ("time" → lower wins, "thrpt" → higher wins);
    libraries whose value is missing or unparsable are skipped. Returns
    None when nothing parses.
    """
    convert = time_ns if key == "time" else thrpt_bps
    winner = None
    winning = float("inf") if lower_is_better else -1.0
    for lib in libs:
        value = convert(results.get((group, fixture, lib), {}).get(key))
        if value is None:
            continue
        improved = value < winning if lower_is_better else value > winning
        if improved:
            winner, winning = lib, value
    return winner
def _row(label: str, cells: list[str], any_best: bool) -> str:
    """Render one markdown table row, bolding the label if the lib won any column."""
    first = f"**{label}**" if any_best else label
    return "| " + " | ".join([first, *cells]) + " |"
def _cell_pair(time_s, thrpt_s, mark_best: bool) -> str:
    """Render a "time · throughput" cell; em-dash when empty, bold for winners."""
    if not time_s:
        return "—"
    if thrpt_s:
        body = f"{time_s} · {thrpt_s}"
    else:
        body = time_s
    return f"**{body}**" if mark_best else body
def _cell_time(time_s, mark_best: bool) -> str:
    """Render a time-only cell; em-dash for missing data, bold for the winner."""
    if not time_s:
        return "—"
    if mark_best:
        return f"**{time_s}**"
    return time_s
def build_parse_table(results) -> str:
    """Build the markdown table for the parse group (time · throughput cells).

    Winners per fixture are chosen by highest throughput.
    """
    header = "| Library | " + " | ".join(PARSE_HEADERS[f] for f in FIXTURES) + " |"
    rows = [header, "|---|" + "---|" * len(FIXTURES)]
    winners = {
        fixture: _best(results, "parse", fixture, PARSE_LIBS, "thrpt", lower_is_better=False)
        for fixture in FIXTURES
    }
    for lib in PARSE_LIBS:
        cells = []
        for fixture in FIXTURES:
            entry = results.get(("parse", fixture, lib), {})
            cells.append(_cell_pair(entry.get("time"), entry.get("thrpt"), winners[fixture] == lib))
        won_any = any(winners[fixture] == lib for fixture in FIXTURES)
        rows.append(_row(PARSE_DISPLAY.get(lib, lib), cells, won_any))
    return "\n".join(rows)
def build_serialize_table(results) -> str:
    """Build the markdown table for the serialize group (time · throughput cells).

    Winners per fixture are chosen by highest throughput; columns are the
    bare fixture names.
    """
    rows = [
        "| Library | " + " | ".join(FIXTURES) + " |",
        "|---|" + "---|" * len(FIXTURES),
    ]
    winners = {
        fixture: _best(results, "serialize", fixture, SERIALIZE_LIBS, "thrpt", lower_is_better=False)
        for fixture in FIXTURES
    }
    for lib in SERIALIZE_LIBS:
        cells = []
        for fixture in FIXTURES:
            entry = results.get(("serialize", fixture, lib), {})
            cells.append(_cell_pair(entry.get("time"), entry.get("thrpt"), winners[fixture] == lib))
        rows.append(_row(lib, cells, any(winners[fixture] == lib for fixture in FIXTURES)))
    return "\n".join(rows)
def build_access_table(results) -> str:
    """Build the markdown table for the access group (time-only cells).

    Winners per fixture are chosen by lowest time.
    """
    header = "| Library | " + " | ".join(ACCESS_HEADERS[fixture] for fixture in FIXTURES) + " |"
    rows = [header, "|---|" + "---|" * len(FIXTURES)]
    winners = {
        fixture: _best(results, "access", fixture, ACCESS_LIBS, "time", lower_is_better=True)
        for fixture in FIXTURES
    }
    for lib in ACCESS_LIBS:
        cells = [
            _cell_time(results.get(("access", fixture, lib), {}).get("time"), winners[fixture] == lib)
            for fixture in FIXTURES
        ]
        rows.append(_row(lib, cells, any(winners[fixture] == lib for fixture in FIXTURES)))
    return "\n".join(rows)
def build_mutate_table(results) -> str:
    """Build the markdown table for the mutate group (twitter fixture only).

    Time and throughput winners are bolded independently; a library's label
    is bolded if it wins either stat.
    """
    rows = ["| Library | time | thrpt |", "|---|---|---|"]
    fastest = _best(results, "mutate", "twitter", MUTATE_LIBS, "time", lower_is_better=True)
    highest = _best(results, "mutate", "twitter", MUTATE_LIBS, "thrpt", lower_is_better=False)
    for lib in MUTATE_LIBS:
        entry = results.get(("mutate", "twitter", lib), {})
        time_cell = _cell_time(entry.get("time"), fastest == lib)
        thrpt = entry.get("thrpt")
        if not thrpt:
            thrpt_cell = "—"
        elif highest == lib:
            thrpt_cell = f"**{thrpt}**"
        else:
            thrpt_cell = thrpt
        rows.append(_row(lib, [time_cell, thrpt_cell], fastest == lib or highest == lib))
    return "\n".join(rows)
def replace_block(text: str, marker_id: str, new_body: str) -> str:
    """Replace the body between <!-- BENCH:<id>:BEGIN/END --> markers in *text*.

    Exits the process with an error message when the marker pair is absent.
    The substitution goes through a callable so backslashes in *new_body*
    are inserted literally rather than interpreted as regex escapes.
    """
    tag = re.escape(marker_id)
    pattern = re.compile(
        rf"(<!-- BENCH:{tag}:BEGIN -->\n).*?(\n<!-- BENCH:{tag}:END -->)",
        re.DOTALL,
    )
    if pattern.search(text) is None:
        sys.exit(
            f"marker BENCH:{marker_id} not found in {BENCH_FILE.relative_to(REPO_ROOT)}"
        )
    return pattern.sub(lambda m: m.group(1) + new_body + m.group(2), text)
def main() -> None:
    """CLI entry point: run (or re-parse) the benches and rewrite BENCHMARKS.md.

    Flags:
      --quick      forward criterion's --quick for a fast, noisier run
      --from-log   parse an existing log file instead of running cargo bench
      --save-log   when benching, also save the raw output to this path
    """
    # BUG FIX: this module has no docstring, so __doc__ is None and the
    # original __doc__.splitlines()[0] raised AttributeError on every run.
    ap = argparse.ArgumentParser(
        description=__doc__.splitlines()[0] if __doc__ else None
    )
    ap.add_argument("--quick", action="store_true", help="pass --quick to criterion")
    ap.add_argument("--from-log", metavar="PATH", help="parse this log instead of running cargo bench")
    ap.add_argument("--save-log", metavar="PATH", help="also write raw bench output to this path")
    args = ap.parse_args()

    if args.from_log:
        raw = Path(args.from_log).read_text()
    else:
        raw = run_bench(args.quick)
        # --save-log is intentionally ignored with --from-log: the log
        # already exists on disk in that case.
        if args.save_log:
            Path(args.save_log).write_text(raw)

    results = parse_log(raw)
    if not results:
        sys.exit("no benchmark results parsed — empty or unrecognized output")

    md = BENCH_FILE.read_text()
    md = replace_block(md, "parse", build_parse_table(results))
    md = replace_block(md, "serialize", build_serialize_table(results))
    md = replace_block(md, "access", build_access_table(results))
    md = replace_block(md, "mutate", build_mutate_table(results))
    # Timezone-aware date so the stamp is stable regardless of the host TZ.
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d")
    md = replace_block(md, "updated", f"_Last updated: {stamp} (auto-generated by `scripts/update_benchmarks.py`)._")
    BENCH_FILE.write_text(md)
    print(
        f"updated {BENCH_FILE.relative_to(REPO_ROOT)} — {len(results)} bench results parsed",
        file=sys.stderr,
    )


if __name__ == "__main__":
    main()