name: Benchmark Suite
on:
push:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:
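# Nightly run at 04:00 UTC; the prepare job expands scheduled runs to the full knot matrix.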
schedule:
- cron: "0 4 * * *"
concurrency:
group: benchmark-suite-${{ github.ref }}
cancel-in-progress: true
permissions:
actions: read
contents: read
env:
CARGO_TERM_COLOR: always
PIP_DISABLE_PIP_VERSION_CHECK: "1"
PIP_NO_PYTHON_VERSION_WARNING: "1"
R_LIBS_USER: ${{ github.workspace }}/.r-lib
BENCH_R_VERSION: "4.4.3"
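# Bump this to invalidate the shared Python/R runtime cache.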
BENCH_RUNTIME_CACHE_VERSION: v2
jobs:
prepare:
name: Prepare benchmark matrix
runs-on: ubuntu-latest
outputs:
parallel_matrix: ${{ steps.matrix.outputs.parallel_matrix }}
parallel_count: ${{ steps.matrix.outputs.parallel_count }}
serial_matrix: ${{ steps.matrix.outputs.serial_matrix }}
serial_count: ${{ steps.matrix.outputs.serial_count }}
steps:
- name: Checkout
uses: actions/checkout@v5
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Python deps for preflight
shell: bash
run: |
set -euo pipefail
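# Minimal dependency set so bench/run_suite.py can be imported for the schema preflights below.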
python3 -m pip install --upgrade pip setuptools wheel
python3 -m pip install numpy pandas scikit-learn lifelines
- name: Preflight scenario schemas
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import importlib.util
from pathlib import Path
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
cfg = json.loads(Path("bench/scenarios.json").read_text())
scenarios = cfg.get("scenarios", [])
if not scenarios:
raise SystemExit("No benchmark scenarios found in bench/scenarios.json")
for s in scenarios:
mod.dataset_for_scenario(s)
print(f"validated {len(scenarios)} scenario dataset schemas")
PY
- name: Preflight geo_subpop16 simulation
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import importlib.util
from pathlib import Path
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
cfg = json.loads(Path("bench/scenarios.json").read_text())
scenarios = cfg.get("scenarios", [])
geo_subpop = [s for s in scenarios if str(s.get("name", "")).startswith("geo_subpop16_")]
if not geo_subpop:
raise SystemExit("No geo_subpop16 scenarios found in bench/scenarios.json")
for s in geo_subpop:
mod.dataset_for_scenario(s)
print(f"validated geo_subpop16 simulation for {len(geo_subpop)} scenarios")
PY
- name: Build scenario matrix
id: matrix
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import os
import re
from pathlib import Path
SERIAL_SCENARIOS = {
"icu_survival_death",
"cirrhosis_survival",
}
def choose_single_knot(variants):
# Prefer k12 when present for comparability across families.
by_k = {v["k"]: v for v in variants}
if 12 in by_k:
return by_k[12]["scenario"]
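# Otherwise fall back to the median knot count in the family.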
ordered = sorted(variants, key=lambda v: v["k"])
return ordered[len(ordered) // 2]["scenario"]
cfg = json.loads(Path("bench/scenarios.json").read_text())
scenarios = cfg.get("scenarios", [])
names = [s["name"] for s in scenarios if "name" in s]
if not names:
raise SystemExit("No benchmark scenarios found in bench/scenarios.json")
event_name = os.environ.get("GITHUB_EVENT_NAME", "").strip().lower()
is_nightly = event_name == "schedule"
if is_nightly:
# Nightly: run full matrix (all knot variants).
selected = names
else:
# Push/manual: run one representative knot per *_kN family.
k_pat = re.compile(r"^(?P<base>.+)_k(?P<k>\d+)$")
families = {}
selected = []
for name in names:
m = k_pat.match(name)
if not m:
selected.append(name)
continue
base = m.group("base")
families.setdefault(base, []).append(
{"scenario": name, "k": int(m.group("k"))}
)
for base in sorted(families):
selected.append(choose_single_knot(families[base]))
include = [{"scenario": name} for name in selected]
if not include:
raise SystemExit("No scenarios selected for the benchmark matrix")
serial_include = [row for row in include if row["scenario"] in SERIAL_SCENARIOS]
parallel_include = [row for row in include if row["scenario"] not in SERIAL_SCENARIOS]
parallel_matrix = {"include": parallel_include}
serial_matrix = {"include": serial_include}
out = Path(os.environ["GITHUB_OUTPUT"])
with out.open("a", encoding="utf-8") as fh:
fh.write(f"parallel_matrix={json.dumps(parallel_matrix)}\n")
fh.write(f"parallel_count={len(parallel_include)}\n")
fh.write(f"serial_matrix={json.dumps(serial_matrix)}\n")
fh.write(f"serial_count={len(serial_include)}\n")
print(
json.dumps(
{
"event": event_name,
"nightly_full_matrix": is_nightly,
"scenario_count": len(selected),
"parallel_count": len(parallel_include),
"serial_count": len(serial_include),
"parallel_matrix": parallel_matrix,
"serial_matrix": serial_matrix,
}
)
)
PY
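# Build the gam release binary once and stage the Python/R dependencies so shard jobs can reuse them instead of reinstalling per shard.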
bootstrap-runtime:
name: Bootstrap benchmark runtime
needs: prepare
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v5
with:
fetch-depth: 1
- name: Restore shared runtime cache
id: runtime-cache
uses: actions/cache/restore@v4
with:
path: |
bench/runtime/pydeps
bench/runtime/r-lib
key: ${{ runner.os }}-bench-runtime-${{ env.BENCH_RUNTIME_CACHE_VERSION }}-${{ hashFiles('.github/actions/bench-setup/action.yml', '.github/workflows/benchmark.yml') }}
restore-keys: |
${{ runner.os }}-bench-runtime-${{ env.BENCH_RUNTIME_CACHE_VERSION }}-
- name: Shared benchmark setup (cached)
if: ${{ steps.runtime-cache.outputs.cache-hit != 'true' }}
uses: ./.github/actions/bench-setup
with:
install-runtime-deps: "true"
r-version: ${{ env.BENCH_R_VERSION }}
- name: Install Rust stable
uses: dtolnay/rust-toolchain@stable
- name: Cache cargo artifacts
uses: Swatinem/rust-cache@v2
- name: Build shared runtime
shell: bash
run: |
set -euo pipefail
mkdir -p bench/runtime
if [ "${{ steps.runtime-cache.outputs.cache-hit }}" != "true" ]; then
mkdir -p bench/runtime/pydeps bench/runtime/r-lib
if [ -d "${R_LIBS_USER}" ]; then
cp -a "${R_LIBS_USER}/." bench/runtime/r-lib/ || true
fi
python3 -m pip install --upgrade pip setuptools wheel
python3 -m pip install --target bench/runtime/pydeps numpy pandas scikit-learn lifelines scikit-survival xgboost matplotlib
fi
cargo build --release --bin gam
cp target/release/gam bench/runtime/gam
chmod +x bench/runtime/gam
- name: Save shared runtime cache
if: ${{ steps.runtime-cache.outputs.cache-hit != 'true' }}
uses: actions/cache/save@v4
with:
path: |
bench/runtime/pydeps
bench/runtime/r-lib
key: ${{ steps.runtime-cache.outputs.cache-primary-key }}
- name: Upload shared runtime artifact
uses: actions/upload-artifact@v4
with:
name: bench-runtime
path: bench/runtime
if-no-files-found: error
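# One shard job per parallel scenario, fanned out up to 8 at a time.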
bench-shard:
name: Bench ${{ matrix.scenario }}
needs:
- prepare
- bootstrap-runtime
if: ${{ needs.prepare.outputs.parallel_count != '0' }}
runs-on: ubuntu-latest
timeout-minutes: 47
strategy:
fail-fast: false
max-parallel: 8
matrix: ${{ fromJSON(needs.prepare.outputs.parallel_matrix) }}
env:
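# Nightly (schedule) runs use the full profile; push, PR, and manual runs use the lean profile.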
BENCH_CI_PROFILE: ${{ github.event_name == 'schedule' && 'full' || 'lean' }}
BENCH_RAYON_THREADS: "4"
BENCH_BLAS_THREADS: "1"
BENCH_CMD_TIMEOUT_SEC: "10800"
steps:
- name: Checkout
uses: actions/checkout@v5
with:
fetch-depth: 1
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Set up R
uses: ./.github/actions/setup-bench-r
with:
r-version: ${{ env.BENCH_R_VERSION }}
- name: Download shared runtime
shell: bash
env:
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
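# Locate the bench-runtime artifact uploaded by bootstrap-runtime in this run and unpack it into bench/runtime/.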
python3 - <<'PY'
import json
import subprocess
import zipfile
from pathlib import Path
repo = "${{ github.repository }}"
run_id = "${{ github.run_id }}"
target_name = "bench-runtime"
out_dir = Path("bench/runtime")
out_dir.mkdir(parents=True, exist_ok=True)
proc = subprocess.run(
["gh", "api", f"/repos/{repo}/actions/runs/{run_id}/artifacts?per_page=100"],
check=True,
capture_output=True,
text=True,
)
artifacts = json.loads(proc.stdout).get("artifacts", [])
match = next((a for a in artifacts if a.get("name") == target_name and not a.get("expired")), None)
if match is None:
raise SystemExit(f"artifact {target_name!r} not found for run {run_id}")
zip_path = Path("bench-runtime.zip")
with zip_path.open("wb") as fh:
subprocess.run(
["gh", "api", f"/repos/{repo}/actions/artifacts/{match['id']}/zip"],
check=True,
stdout=fh,
)
with zipfile.ZipFile(zip_path) as zf:
zf.extractall(out_dir)
zip_path.unlink()
PY
- name: Run scenario shard
shell: bash
env:
PYTHONPATH: ${{ github.workspace }}/bench/runtime/pydeps
R_LIBS_USER: ${{ github.workspace }}/bench/runtime/r-lib
BENCH_GAM_BIN: ${{ github.workspace }}/bench/runtime/gam
run: |
set -euo pipefail
echo "== Benchmark shard diagnostics =="
echo "pwd: $(pwd)"
echo "scenario: ${{ matrix.scenario }}"
echo "python: $(command -v python3)"
python3 --version
echo "Rscript: $(command -v Rscript)"
Rscript --version
echo "BENCH_GAM_BIN=${BENCH_GAM_BIN}"
echo "BENCH_CI_PROFILE=${BENCH_CI_PROFILE}"
echo "BENCH_RAYON_THREADS=${BENCH_RAYON_THREADS}"
echo "BENCH_BLAS_THREADS=${BENCH_BLAS_THREADS}"
echo "BENCH_CMD_TIMEOUT_SEC=${BENCH_CMD_TIMEOUT_SEC}"
echo "PYTHONPATH=${PYTHONPATH}"
echo "R_LIBS_USER=${R_LIBS_USER}"
echo "bench/runtime tree:"
ls -la bench/runtime || true
ls -la bench/runtime/pydeps | sed -n '1,40p' || true
ls -la bench/runtime/r-lib | sed -n '1,40p' || true
echo "gam binary before chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
chmod +x "${BENCH_GAM_BIN}" || true
echo "gam binary after chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
test -x "${BENCH_GAM_BIN}"
echo "smoke-import python deps"
python3 - <<'PY'
import importlib.util
from pathlib import Path
import numpy, pandas, lifelines, sksurv, xgboost
scenario_name = "${{ matrix.scenario }}"
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
print(f"python deps ok (scenario={scenario_name})")
PY
echo "smoke-load R deps"
Rscript -e 'library(mgcv); library(jsonlite); library(survival); library(glmnet); library(gamlss); library(gamlss.dist); library(gamboostLSS); library(mboost); library(bamlss); library(brms); cat("R deps ok\n")'
mkdir -p bench/results-shards
echo "running benchmark shard..."
# Wrap the bench command in GNU timeout so an over-budget shard
# FAILS (exit 124 / 137) rather than being cancelled by the GH
# job-level timeout-minutes ceiling. We set the GNU timeout
# several minutes shorter than that ceiling so the shard exits
# cleanly before GH would auto-cancel; the explicit `exit 1`
# below ensures the conclusion is `failure`, not `cancelled`.
BENCH_SHARD_TIMEOUT="${BENCH_SHARD_TIMEOUT:-42m}"
set +e
timeout --signal=TERM --kill-after=30s "${BENCH_SHARD_TIMEOUT}" \
python3 bench/run_suite.py \
--scenarios bench/scenarios.json \
--scenario-name "${{ matrix.scenario }}" \
--out "bench/results-shards/${{ matrix.scenario }}.json"
rc=$?
set -e
if [ "${rc}" -ne 0 ]; then
if [ "${rc}" = "124" ] || [ "${rc}" = "137" ]; then
echo "::error title=Benchmark shard timeout::Scenario ${{ matrix.scenario }} exceeded ${BENCH_SHARD_TIMEOUT} wall budget (exit ${rc}); failing the job rather than being cancelled by the GH job-level ceiling."
else
echo "::error title=Benchmark shard error::Scenario ${{ matrix.scenario }} failed with exit ${rc}."
fi
exit 1
fi
echo "shard output:"
ls -l "bench/results-shards/${{ matrix.scenario }}.json"
- name: Upload shard artifact
uses: actions/upload-artifact@v4
with:
name: bench-${{ matrix.scenario }}
path: |
bench/results-shards/${{ matrix.scenario }}.json
bench/results-shards/figures/${{ matrix.scenario }}.png
if-no-files-found: error
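# Identical steps to bench-shard, but the pinned scenarios run one at a time (max-parallel: 1).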
bench-shard-serial:
name: Bench ${{ matrix.scenario }} (serial)
needs:
- prepare
- bootstrap-runtime
if: ${{ needs.prepare.outputs.serial_count != '0' }}
runs-on: ubuntu-24.04
timeout-minutes: 47
strategy:
fail-fast: false
max-parallel: 1
matrix: ${{ fromJSON(needs.prepare.outputs.serial_matrix) }}
env:
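# Nightly (schedule) runs use the full profile; push, PR, and manual runs use the lean profile.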
BENCH_CI_PROFILE: ${{ github.event_name == 'schedule' && 'full' || 'lean' }}
BENCH_RAYON_THREADS: "4"
BENCH_BLAS_THREADS: "1"
BENCH_CMD_TIMEOUT_SEC: "10800"
steps:
- name: Checkout
uses: actions/checkout@v5
with:
fetch-depth: 1
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Set up R
uses: ./.github/actions/setup-bench-r
with:
r-version: ${{ env.BENCH_R_VERSION }}
- name: Download shared runtime
shell: bash
env:
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
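# Locate the bench-runtime artifact uploaded by bootstrap-runtime in this run and unpack it into bench/runtime/.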
python3 - <<'PY'
import json
import subprocess
import zipfile
from pathlib import Path
repo = "${{ github.repository }}"
run_id = "${{ github.run_id }}"
target_name = "bench-runtime"
out_dir = Path("bench/runtime")
out_dir.mkdir(parents=True, exist_ok=True)
proc = subprocess.run(
["gh", "api", f"/repos/{repo}/actions/runs/{run_id}/artifacts?per_page=100"],
check=True,
capture_output=True,
text=True,
)
artifacts = json.loads(proc.stdout).get("artifacts", [])
match = next((a for a in artifacts if a.get("name") == target_name and not a.get("expired")), None)
if match is None:
raise SystemExit(f"artifact {target_name!r} not found for run {run_id}")
zip_path = Path("bench-runtime.zip")
with zip_path.open("wb") as fh:
subprocess.run(
["gh", "api", f"/repos/{repo}/actions/artifacts/{match['id']}/zip"],
check=True,
stdout=fh,
)
with zipfile.ZipFile(zip_path) as zf:
zf.extractall(out_dir)
zip_path.unlink()
PY
- name: Run scenario shard
shell: bash
env:
PYTHONPATH: ${{ github.workspace }}/bench/runtime/pydeps
R_LIBS_USER: ${{ github.workspace }}/bench/runtime/r-lib
BENCH_GAM_BIN: ${{ github.workspace }}/bench/runtime/gam
run: |
set -euo pipefail
echo "== Benchmark shard diagnostics =="
echo "pwd: $(pwd)"
echo "scenario: ${{ matrix.scenario }}"
echo "python: $(command -v python3)"
python3 --version
echo "Rscript: $(command -v Rscript)"
Rscript --version
echo "BENCH_GAM_BIN=${BENCH_GAM_BIN}"
echo "BENCH_CI_PROFILE=${BENCH_CI_PROFILE}"
echo "BENCH_RAYON_THREADS=${BENCH_RAYON_THREADS}"
echo "BENCH_BLAS_THREADS=${BENCH_BLAS_THREADS}"
echo "BENCH_CMD_TIMEOUT_SEC=${BENCH_CMD_TIMEOUT_SEC}"
echo "PYTHONPATH=${PYTHONPATH}"
echo "R_LIBS_USER=${R_LIBS_USER}"
echo "bench/runtime tree:"
ls -la bench/runtime || true
ls -la bench/runtime/pydeps | sed -n '1,40p' || true
ls -la bench/runtime/r-lib | sed -n '1,40p' || true
echo "gam binary before chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
chmod +x "${BENCH_GAM_BIN}" || true
echo "gam binary after chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
test -x "${BENCH_GAM_BIN}"
echo "smoke-import python deps"
python3 - <<'PY'
import importlib.util
from pathlib import Path
import numpy, pandas, lifelines, sksurv, xgboost
scenario_name = "${{ matrix.scenario }}"
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
print(f"python deps ok (scenario={scenario_name})")
PY
echo "smoke-load R deps"
Rscript -e 'library(mgcv); library(jsonlite); library(survival); library(glmnet); library(gamlss); library(gamlss.dist); library(gamboostLSS); library(mboost); library(bamlss); library(brms); cat("R deps ok\n")'
mkdir -p bench/results-shards
echo "running benchmark shard..."
# Wrap the bench command in GNU timeout so an over-budget shard
# FAILS (exit 124 / 137) rather than being cancelled by the GH
# job-level timeout-minutes ceiling. We set the GNU timeout
# several minutes shorter than that ceiling so the shard exits
# cleanly before GH would auto-cancel; the explicit `exit 1`
# below ensures the conclusion is `failure`, not `cancelled`.
BENCH_SHARD_TIMEOUT="${BENCH_SHARD_TIMEOUT:-42m}"
set +e
timeout --signal=TERM --kill-after=30s "${BENCH_SHARD_TIMEOUT}" \
python3 bench/run_suite.py \
--scenarios bench/scenarios.json \
--scenario-name "${{ matrix.scenario }}" \
--out "bench/results-shards/${{ matrix.scenario }}.json"
rc=$?
set -e
if [ "${rc}" -ne 0 ]; then
if [ "${rc}" = "124" ] || [ "${rc}" = "137" ]; then
echo "::error title=Benchmark shard timeout::Scenario ${{ matrix.scenario }} exceeded ${BENCH_SHARD_TIMEOUT} wall budget (exit ${rc}); failing the job rather than being cancelled by the GH job-level ceiling."
else
echo "::error title=Benchmark shard error::Scenario ${{ matrix.scenario }} failed with exit ${rc}."
fi
exit 1
fi
echo "shard output:"
ls -l "bench/results-shards/${{ matrix.scenario }}.json"
- name: Upload shard artifact
uses: actions/upload-artifact@v4
with:
name: bench-${{ matrix.scenario }}
path: |
bench/results-shards/${{ matrix.scenario }}.json
bench/results-shards/figures/${{ matrix.scenario }}.png
if-no-files-found: error
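# Merge the per-scenario shard JSON files, publish the combined nightly results, and render the step summary and figures.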
aggregate:
name: Aggregate benchmark results
needs:
- bench-shard
- bench-shard-serial
# Run even when a shard job was skipped (e.g. an empty serial matrix);
# still skip on shard failure or run cancellation.
if: ${{ !cancelled() && needs.bench-shard.result != 'failure' && needs.bench-shard-serial.result != 'failure' }}
runs-on: ubuntu-latest
steps:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Checkout for figure generation
uses: actions/checkout@v5
with:
fetch-depth: 1
sparse-checkout: bench
- name: Download all shard artifacts
shell: bash
env:
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
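# Download every bench-* shard artifact from this run (excluding bench-runtime) into bench/artifacts/ for merging.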
python3 - <<'PY'
import json
import subprocess
import zipfile
from pathlib import Path
repo = "${{ github.repository }}"
run_id = "${{ github.run_id }}"
root = Path("bench/artifacts")
root.mkdir(parents=True, exist_ok=True)
proc = subprocess.run(
["gh", "api", f"/repos/{repo}/actions/runs/{run_id}/artifacts?per_page=100"],
check=True,
capture_output=True,
text=True,
)
artifacts = json.loads(proc.stdout).get("artifacts", [])
shard_artifacts = [
a for a in artifacts
if str(a.get("name", "")).startswith("bench-")
and str(a.get("name", "")) != "bench-runtime"
and not a.get("expired")
]
if not shard_artifacts:
raise SystemExit("No shard artifacts found to merge.")
for artifact in shard_artifacts:
artifact_dir = root / str(artifact["name"])
artifact_dir.mkdir(parents=True, exist_ok=True)
zip_path = artifact_dir.with_suffix(".zip")
with zip_path.open("wb") as fh:
subprocess.run(
["gh", "api", f"/repos/{repo}/actions/artifacts/{artifact['id']}/zip"],
check=True,
stdout=fh,
)
with zipfile.ZipFile(zip_path) as zf:
zf.extractall(artifact_dir)
zip_path.unlink()
PY
- name: Merge shard JSON outputs
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import os
from datetime import datetime, timezone
from pathlib import Path
def fmt_num(v, digits=4):
if v is None:
return "—"
try:
return f"{float(v):.{digits}f}"
except Exception:
return "—"
def fmt_status(row):
status = str(row.get("status", "unknown"))
if status == "ok":
return "ok"
return f"failed: {row.get('error', 'unknown error')}"
def md_escape(v):
if v is None:
return "—"
s = str(v).replace("\n", " ").strip()
return s.replace("|", "\\|")
artifacts = Path("bench/artifacts")
files = sorted(artifacts.rglob("*.json"))
if not files:
raise SystemExit("No shard artifacts found to merge.")
merged = []
for fp in files:
payload = json.loads(fp.read_text())
merged.extend(payload.get("results", []))
merged_sorted = sorted(
merged,
key=lambda r: (
str(r.get("scenario_name", "")),
str(r.get("contender", "")),
),
)
out = Path("bench/results.nightly.json")
out.write_text(
json.dumps(
{
"created_at_utc": datetime.now(timezone.utc).isoformat(),
"results": merged,
},
indent=2,
)
)
print(f"Wrote {out} from {len(files)} shard files with {len(merged)} rows.")
summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
if summary_path:
ok_count = sum(1 for r in merged_sorted if str(r.get("status", "")) == "ok")
fail_count = len(merged_sorted) - ok_count
family_labels = {"binomial": "Classification", "gaussian": "Regression", "survival": "Survival"}
family_rows = {}
for r in merged_sorted:
fam = str(r.get("family") or "unknown")
family_rows.setdefault(fam, []).append(r)
def primary_metric(row):
fam = str(row.get("family") or "")
if fam == "binomial":
return "Brier", row.get("brier"), "min"
if fam == "gaussian":
return "RMSE", row.get("rmse"), "min"
if fam == "survival":
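# Survival rows carry the C-index in the "auc" field, matching the "C-index/AUC" column below.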
return "C-index", row.get("auc"), "max"
return "n/a", None, "min"
def winner_for_family(fam):
rows = [r for r in family_rows.get(fam, []) if str(r.get("status", "")) == "ok"]
if not rows:
return None
vals = []
for r in rows:
_, v, mode = primary_metric(r)
if v is None:
continue
vals.append((float(v), r, mode))
if not vals:
return None
mode = vals[0][2]
return (min(vals, key=lambda x: x[0]) if mode == "min" else max(vals, key=lambda x: x[0]))[1]
lines = []
lines.append("## Benchmark Results")
lines.append("")
lines.append(f"- Generated: `{datetime.now(timezone.utc).isoformat()}`")
lines.append(f"- Shards merged: `{len(files)}`")
lines.append(f"- Result rows: `{len(merged_sorted)}` (`{ok_count}` ok, `{fail_count}` failed)")
lines.append("- Primary metrics by task:")
lines.append(" - Classification: lower Brier (with AUC + LogLoss shown)")
lines.append(" - Regression: lower RMSE (with MAE + R2 shown)")
lines.append(" - Survival: higher C-index")
lines.append("")
lines.append("### Leaders")
lines.append("")
lines.append("| Task | Winner | Primary Metric | Value |")
lines.append("|---|---|---|---:|")
for fam in ("binomial", "gaussian", "survival"):
winner = winner_for_family(fam)
if winner is None:
lines.append(f"| {family_labels.get(fam, fam)} | — | — | — |")
continue
m_name, m_val, _ = primary_metric(winner)
lines.append(
f"| {family_labels.get(fam, fam)} | "
f"{md_escape(winner.get('contender'))} ({md_escape(winner.get('scenario_name'))}) | "
f"{m_name} | {fmt_num(m_val)} |"
)
lines.append("")
lines.append("### Full Table")
lines.append("")
lines.append("| Task | Scenario | Contender | Status | C-index/AUC | Brier | LogLoss | RMSE | MAE | R2 | Fit (s) | Predict (s) |")
lines.append("|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|")
for r in merged_sorted:
fam = str(r.get("family") or "unknown")
lines.append(
"| "
+ " | ".join(
[
md_escape(family_labels.get(fam, fam)),
md_escape(r.get("scenario_name")),
md_escape(r.get("contender")),
md_escape(fmt_status(r)),
fmt_num(r.get("auc")),
fmt_num(r.get("brier")),
fmt_num(r.get("logloss")),
fmt_num(r.get("rmse")),
fmt_num(r.get("mae")),
fmt_num(r.get("r2")),
fmt_num(r.get("fit_sec")),
fmt_num(r.get("predict_sec")),
]
)
+ " |"
)
lines.append("")
lines.append("### Model Specs")
lines.append("")
lines.append("| Scenario | Contender | Model Spec |")
lines.append("|---|---|---|")
for r in merged_sorted:
lines.append(
f"| {md_escape(r.get('scenario_name'))} | {md_escape(r.get('contender'))} | {md_escape(r.get('model_spec'))} |"
)
Path(summary_path).write_text("\n".join(lines) + "\n")
PY
- name: Upload merged nightly results
uses: actions/upload-artifact@v4
with:
name: benchmark-results-nightly
path: bench/results.nightly.json
if-no-files-found: error
- name: Generate merged figures
shell: bash
run: |
set -euo pipefail
python3 -m pip install --quiet matplotlib numpy pandas
python3 bench/generate_figures.py bench/results.nightly.json bench/figures bench/figures.zip
- name: Upload benchmark figures
uses: actions/upload-artifact@v4
with:
name: benchmark-figures
path: bench/figures.zip
if-no-files-found: warn