name: Benchmark Suite
on:
push:
branches:
- main
pull_request:
branches:
- main
workflow_dispatch:
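# Nightly run at 04:00 UTC; the prepare job expands scheduled runs to the full knot matrix.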
schedule:
- cron: "0 4 * * *"
concurrency:
group: benchmark-suite-${{ github.ref }}
cancel-in-progress: true
permissions:
actions: read
contents: read
env:
CARGO_TERM_COLOR: always
PIP_DISABLE_PIP_VERSION_CHECK: "1"
PIP_NO_PYTHON_VERSION_WARNING: "1"
R_LIBS_USER: ${{ github.workspace }}/.r-lib
BENCH_R_VERSION: "4.4.3"
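# Bump this to invalidate the shared Python/R runtime cache.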
BENCH_RUNTIME_CACHE_VERSION: v2
jobs:
prepare:
name: Prepare benchmark matrix
runs-on: ubuntu-latest
outputs:
parallel_matrix: ${{ steps.matrix.outputs.parallel_matrix }}
parallel_count: ${{ steps.matrix.outputs.parallel_count }}
serial_matrix: ${{ steps.matrix.outputs.serial_matrix }}
serial_count: ${{ steps.matrix.outputs.serial_count }}
steps:
- name: Checkout
uses: actions/checkout@v5
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Install Python deps for preflight
shell: bash
run: |
set -euo pipefail
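# Minimal dependency set so bench/run_suite.py can be imported for the schema preflights below.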
python3 -m pip install --upgrade pip setuptools wheel
python3 -m pip install numpy pandas scikit-learn lifelines
- name: Preflight scenario schemas
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import importlib.util
from pathlib import Path
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
cfg = json.loads(Path("bench/scenarios.json").read_text())
scenarios = cfg.get("scenarios", [])
if not scenarios:
raise SystemExit("No benchmark scenarios found in bench/scenarios.json")
for s in scenarios:
mod.dataset_for_scenario(s)
print(f"validated {len(scenarios)} scenario dataset schemas")
PY
- name: Preflight geo_subpop16 simulation
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import importlib.util
from pathlib import Path
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
cfg = json.loads(Path("bench/scenarios.json").read_text())
scenarios = cfg.get("scenarios", [])
geo_subpop = [s for s in scenarios if str(s.get("name", "")).startswith("geo_subpop16_")]
if not geo_subpop:
raise SystemExit("No geo_subpop16 scenarios found in bench/scenarios.json")
for s in geo_subpop:
mod.dataset_for_scenario(s)
print(f"validated geo_subpop16 simulation for {len(geo_subpop)} scenarios")
PY
- name: Build scenario matrix
id: matrix
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import os
import re
from pathlib import Path
SERIAL_SCENARIOS = {
"icu_survival_death",
"cirrhosis_survival",
}
def choose_single_knot(variants):
# Prefer k12 when present for comparability across families.
by_k = {v["k"]: v for v in variants}
if 12 in by_k:
return by_k[12]["scenario"]
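# Otherwise fall back to the median knot count in the family.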
ordered = sorted(variants, key=lambda v: v["k"])
return ordered[len(ordered) // 2]["scenario"]
cfg = json.loads(Path("bench/scenarios.json").read_text())
scenarios = cfg.get("scenarios", [])
names = [s["name"] for s in scenarios if "name" in s]
if not names:
raise SystemExit("No benchmark scenarios found in bench/scenarios.json")
event_name = os.environ.get("GITHUB_EVENT_NAME", "").strip().lower()
is_nightly = event_name == "schedule"
if is_nightly:
# Nightly: run full matrix (all knot variants).
selected = names
else:
# Push/manual: run one representative knot per *_kN family.
k_pat = re.compile(r"^(?P<base>.+)_k(?P<k>\d+)$")
families = {}
selected = []
for name in names:
m = k_pat.match(name)
if not m:
selected.append(name)
continue
base = m.group("base")
families.setdefault(base, []).append(
{"scenario": name, "k": int(m.group("k"))}
)
for base in sorted(families):
selected.append(choose_single_knot(families[base]))
include = [{"scenario": name} for name in selected]
if not include:
raise SystemExit("No scenarios selected for the benchmark matrix")
serial_include = [row for row in include if row["scenario"] in SERIAL_SCENARIOS]
parallel_include = [row for row in include if row["scenario"] not in SERIAL_SCENARIOS]
parallel_matrix = {"include": parallel_include}
serial_matrix = {"include": serial_include}
out = Path(os.environ["GITHUB_OUTPUT"])
with out.open("a", encoding="utf-8") as fh:
fh.write(f"parallel_matrix={json.dumps(parallel_matrix)}\n")
fh.write(f"parallel_count={len(parallel_include)}\n")
fh.write(f"serial_matrix={json.dumps(serial_matrix)}\n")
fh.write(f"serial_count={len(serial_include)}\n")
print(
json.dumps(
{
"event": event_name,
"nightly_full_matrix": is_nightly,
"scenario_count": len(selected),
"parallel_count": len(parallel_include),
"serial_count": len(serial_include),
"parallel_matrix": parallel_matrix,
"serial_matrix": serial_matrix,
}
)
)
PY
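# Build the gam release binary once and stage the Python/R dependencies so shard jobs can reuse them instead of reinstalling per shard.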
bootstrap-runtime:
name: Bootstrap benchmark runtime
needs: prepare
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v5
with:
fetch-depth: 1
- name: Restore shared runtime cache
id: runtime-cache
uses: actions/cache/restore@v4
with:
path: |
bench/runtime/pydeps
bench/runtime/r-lib
key: ${{ runner.os }}-bench-runtime-${{ env.BENCH_RUNTIME_CACHE_VERSION }}-${{ hashFiles('.github/actions/bench-setup/action.yml', '.github/workflows/benchmark.yml') }}
restore-keys: |
${{ runner.os }}-bench-runtime-${{ env.BENCH_RUNTIME_CACHE_VERSION }}-
- name: Shared benchmark setup (cached)
if: ${{ steps.runtime-cache.outputs.cache-hit != 'true' }}
uses: ./.github/actions/bench-setup
with:
install-runtime-deps: "true"
r-version: ${{ env.BENCH_R_VERSION }}
- name: Install Rust stable
uses: dtolnay/rust-toolchain@stable
- name: Cache cargo artifacts
uses: Swatinem/rust-cache@v2
- name: Build shared runtime
shell: bash
run: |
set -euo pipefail
mkdir -p bench/runtime
if [ "${{ steps.runtime-cache.outputs.cache-hit }}" != "true" ]; then
mkdir -p bench/runtime/pydeps bench/runtime/r-lib
if [ -d "${R_LIBS_USER}" ]; then
cp -a "${R_LIBS_USER}/." bench/runtime/r-lib/ || true
fi
python3 -m pip install --upgrade pip setuptools wheel
python3 -m pip install --target bench/runtime/pydeps numpy pandas scikit-learn lifelines scikit-survival xgboost matplotlib
fi
cargo build --release --bin gam
cp target/release/gam bench/runtime/gam
chmod +x bench/runtime/gam
- name: Save shared runtime cache
if: ${{ steps.runtime-cache.outputs.cache-hit != 'true' }}
uses: actions/cache/save@v4
with:
path: |
bench/runtime/pydeps
bench/runtime/r-lib
key: ${{ steps.runtime-cache.outputs.cache-primary-key }}
- name: Upload shared runtime artifact
uses: actions/upload-artifact@v4
with:
name: bench-runtime
path: bench/runtime
if-no-files-found: error
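# One shard job per parallel scenario, fanned out up to 8 at a time.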
bench-shard:
name: Bench ${{ matrix.scenario }}
needs:
- prepare
- bootstrap-runtime
if: ${{ needs.prepare.outputs.parallel_count != '0' }}
runs-on: ubuntu-latest
timeout-minutes: 47
strategy:
fail-fast: false
max-parallel: 8
matrix: ${{ fromJSON(needs.prepare.outputs.parallel_matrix) }}
env:
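# Nightly (schedule) runs use the full profile; push, PR, and manual runs use the lean profile.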
BENCH_CI_PROFILE: ${{ github.event_name == 'schedule' && 'full' || 'lean' }}
BENCH_RAYON_THREADS: "4"
BENCH_BLAS_THREADS: "1"
BENCH_CMD_TIMEOUT_SEC: "10800"
steps:
- name: Checkout
uses: actions/checkout@v5
with:
fetch-depth: 1
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Set up R
uses: ./.github/actions/setup-bench-r
with:
r-version: ${{ env.BENCH_R_VERSION }}
- name: Download shared runtime
shell: bash
env:
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
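# Locate the bench-runtime artifact uploaded by bootstrap-runtime in this run and unpack it into bench/runtime/.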
python3 - <<'PY'
import json
import subprocess
import zipfile
from pathlib import Path
repo = "${{ github.repository }}"
run_id = "${{ github.run_id }}"
target_name = "bench-runtime"
out_dir = Path("bench/runtime")
out_dir.mkdir(parents=True, exist_ok=True)
proc = subprocess.run(
["gh", "api", f"/repos/{repo}/actions/runs/{run_id}/artifacts?per_page=100"],
check=True,
capture_output=True,
text=True,
)
artifacts = json.loads(proc.stdout).get("artifacts", [])
match = next((a for a in artifacts if a.get("name") == target_name and not a.get("expired")), None)
if match is None:
raise SystemExit(f"artifact {target_name!r} not found for run {run_id}")
zip_path = Path("bench-runtime.zip")
with zip_path.open("wb") as fh:
subprocess.run(
["gh", "api", f"/repos/{repo}/actions/artifacts/{match['id']}/zip"],
check=True,
stdout=fh,
)
with zipfile.ZipFile(zip_path) as zf:
zf.extractall(out_dir)
zip_path.unlink()
PY
- name: Run scenario shard
shell: bash
env:
PYTHONPATH: ${{ github.workspace }}/bench/runtime/pydeps
R_LIBS_USER: ${{ github.workspace }}/bench/runtime/r-lib
BENCH_GAM_BIN: ${{ github.workspace }}/bench/runtime/gam
run: |
set -euo pipefail
echo "== Benchmark shard diagnostics =="
echo "pwd: $(pwd)"
echo "scenario: ${{ matrix.scenario }}"
echo "python: $(command -v python3)"
python3 --version
echo "Rscript: $(command -v Rscript)"
Rscript --version
echo "BENCH_GAM_BIN=${BENCH_GAM_BIN}"
echo "BENCH_CI_PROFILE=${BENCH_CI_PROFILE}"
echo "BENCH_RAYON_THREADS=${BENCH_RAYON_THREADS}"
echo "BENCH_BLAS_THREADS=${BENCH_BLAS_THREADS}"
echo "BENCH_CMD_TIMEOUT_SEC=${BENCH_CMD_TIMEOUT_SEC}"
echo "PYTHONPATH=${PYTHONPATH}"
echo "R_LIBS_USER=${R_LIBS_USER}"
echo "bench/runtime tree:"
ls -la bench/runtime || true
ls -la bench/runtime/pydeps | sed -n '1,40p' || true
ls -la bench/runtime/r-lib | sed -n '1,40p' || true
echo "gam binary before chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
chmod +x "${BENCH_GAM_BIN}" || true
echo "gam binary after chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
test -x "${BENCH_GAM_BIN}"
echo "smoke-import python deps"
python3 - <<'PY'
import importlib.util
from pathlib import Path
import numpy, pandas, lifelines, sksurv, xgboost
scenario_name = "${{ matrix.scenario }}"
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
print(f"python deps ok (scenario={scenario_name})")
PY
echo "smoke-load R deps"
Rscript -e 'library(mgcv); library(jsonlite); library(survival); library(glmnet); library(gamlss); library(gamlss.dist); library(gamboostLSS); library(mboost); library(bamlss); library(brms); cat("R deps ok\n")'
mkdir -p bench/results-shards
echo "running benchmark shard..."
# Wrap the bench command in GNU timeout so an over-budget shard
# FAILS (exit 124 / 137) rather than being cancelled by the GH
# job-level timeout-minutes ceiling. We set the GNU timeout
# several minutes shorter than that ceiling so the shard exits
# cleanly before GH would auto-cancel; the explicit `exit 1`
# below ensures the conclusion is `failure`, not `cancelled`.
BENCH_SHARD_TIMEOUT="${BENCH_SHARD_TIMEOUT:-42m}"
set +e
timeout --signal=TERM --kill-after=30s "${BENCH_SHARD_TIMEOUT}" \
python3 bench/run_suite.py \
--scenarios bench/scenarios.json \
--scenario-name "${{ matrix.scenario }}" \
--out "bench/results-shards/${{ matrix.scenario }}.json"
rc=$?
set -e
if [ "${rc}" -ne 0 ]; then
if [ "${rc}" = "124" ] || [ "${rc}" = "137" ]; then
echo "::error title=Benchmark shard timeout::Scenario ${{ matrix.scenario }} exceeded ${BENCH_SHARD_TIMEOUT} wall budget (exit ${rc}); failing the job rather than being cancelled by the GH job-level ceiling."
else
echo "::error title=Benchmark shard error::Scenario ${{ matrix.scenario }} failed with exit ${rc}."
fi
exit 1
fi
echo "shard output:"
ls -l "bench/results-shards/${{ matrix.scenario }}.json"
- name: Upload shard artifact
uses: actions/upload-artifact@v4
with:
name: bench-${{ matrix.scenario }}
path: |
bench/results-shards/${{ matrix.scenario }}.json
bench/results-shards/figures/${{ matrix.scenario }}.png
if-no-files-found: error
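# Identical steps to bench-shard, but the pinned scenarios run one at a time (max-parallel: 1).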
bench-shard-serial:
name: Bench ${{ matrix.scenario }} (serial)
needs:
- prepare
- bootstrap-runtime
if: ${{ needs.prepare.outputs.serial_count != '0' }}
runs-on: ubuntu-24.04
timeout-minutes: 47
strategy:
fail-fast: false
max-parallel: 1
matrix: ${{ fromJSON(needs.prepare.outputs.serial_matrix) }}
env:
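# Nightly (schedule) runs use the full profile; push, PR, and manual runs use the lean profile.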
BENCH_CI_PROFILE: ${{ github.event_name == 'schedule' && 'full' || 'lean' }}
BENCH_RAYON_THREADS: "4"
BENCH_BLAS_THREADS: "1"
BENCH_CMD_TIMEOUT_SEC: "10800"
steps:
- name: Checkout
uses: actions/checkout@v5
with:
fetch-depth: 1
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Set up R
uses: ./.github/actions/setup-bench-r
with:
r-version: ${{ env.BENCH_R_VERSION }}
- name: Download shared runtime
shell: bash
env:
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
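# Locate the bench-runtime artifact uploaded by bootstrap-runtime in this run and unpack it into bench/runtime/.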
python3 - <<'PY'
import json
import subprocess
import zipfile
from pathlib import Path
repo = "${{ github.repository }}"
run_id = "${{ github.run_id }}"
target_name = "bench-runtime"
out_dir = Path("bench/runtime")
out_dir.mkdir(parents=True, exist_ok=True)
proc = subprocess.run(
["gh", "api", f"/repos/{repo}/actions/runs/{run_id}/artifacts?per_page=100"],
check=True,
capture_output=True,
text=True,
)
artifacts = json.loads(proc.stdout).get("artifacts", [])
match = next((a for a in artifacts if a.get("name") == target_name and not a.get("expired")), None)
if match is None:
raise SystemExit(f"artifact {target_name!r} not found for run {run_id}")
zip_path = Path("bench-runtime.zip")
with zip_path.open("wb") as fh:
subprocess.run(
["gh", "api", f"/repos/{repo}/actions/artifacts/{match['id']}/zip"],
check=True,
stdout=fh,
)
with zipfile.ZipFile(zip_path) as zf:
zf.extractall(out_dir)
zip_path.unlink()
PY
- name: Run scenario shard
shell: bash
env:
PYTHONPATH: ${{ github.workspace }}/bench/runtime/pydeps
R_LIBS_USER: ${{ github.workspace }}/bench/runtime/r-lib
BENCH_GAM_BIN: ${{ github.workspace }}/bench/runtime/gam
run: |
set -euo pipefail
echo "== Benchmark shard diagnostics =="
echo "pwd: $(pwd)"
echo "scenario: ${{ matrix.scenario }}"
echo "python: $(command -v python3)"
python3 --version
echo "Rscript: $(command -v Rscript)"
Rscript --version
echo "BENCH_GAM_BIN=${BENCH_GAM_BIN}"
echo "BENCH_CI_PROFILE=${BENCH_CI_PROFILE}"
echo "BENCH_RAYON_THREADS=${BENCH_RAYON_THREADS}"
echo "BENCH_BLAS_THREADS=${BENCH_BLAS_THREADS}"
echo "BENCH_CMD_TIMEOUT_SEC=${BENCH_CMD_TIMEOUT_SEC}"
echo "PYTHONPATH=${PYTHONPATH}"
echo "R_LIBS_USER=${R_LIBS_USER}"
echo "bench/runtime tree:"
ls -la bench/runtime || true
ls -la bench/runtime/pydeps | sed -n '1,40p' || true
ls -la bench/runtime/r-lib | sed -n '1,40p' || true
echo "gam binary before chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
chmod +x "${BENCH_GAM_BIN}" || true
echo "gam binary after chmod:"
ls -l "${BENCH_GAM_BIN}" || true
file "${BENCH_GAM_BIN}" || true
test -x "${BENCH_GAM_BIN}"
echo "smoke-import python deps"
python3 - <<'PY'
import importlib.util
from pathlib import Path
import numpy, pandas, lifelines, sksurv, xgboost
scenario_name = "${{ matrix.scenario }}"
mod_path = Path("bench/run_suite.py").resolve()
spec = importlib.util.spec_from_file_location("run_suite_mod", mod_path)
mod = importlib.util.module_from_spec(spec)
spec.loader.exec_module(mod)
print(f"python deps ok (scenario={scenario_name})")
PY
echo "smoke-load R deps"
Rscript -e 'library(mgcv); library(jsonlite); library(survival); library(glmnet); library(gamlss); library(gamlss.dist); library(gamboostLSS); library(mboost); library(bamlss); library(brms); cat("R deps ok\n")'
mkdir -p bench/results-shards
echo "running benchmark shard..."
# Wrap the bench command in GNU timeout so an over-budget shard
# FAILS (exit 124 / 137) rather than being cancelled by the GH
# job-level timeout-minutes ceiling. We set the GNU timeout
# several minutes shorter than that ceiling so the shard exits
# cleanly before GH would auto-cancel; the explicit `exit 1`
# below ensures the conclusion is `failure`, not `cancelled`.
BENCH_SHARD_TIMEOUT="${BENCH_SHARD_TIMEOUT:-42m}"
set +e
timeout --signal=TERM --kill-after=30s "${BENCH_SHARD_TIMEOUT}" \
python3 bench/run_suite.py \
--scenarios bench/scenarios.json \
--scenario-name "${{ matrix.scenario }}" \
--out "bench/results-shards/${{ matrix.scenario }}.json"
rc=$?
set -e
if [ "${rc}" -ne 0 ]; then
if [ "${rc}" = "124" ] || [ "${rc}" = "137" ]; then
echo "::error title=Benchmark shard timeout::Scenario ${{ matrix.scenario }} exceeded ${BENCH_SHARD_TIMEOUT} wall budget (exit ${rc}); failing the job rather than being cancelled by the GH job-level ceiling."
else
echo "::error title=Benchmark shard error::Scenario ${{ matrix.scenario }} failed with exit ${rc}."
fi
exit 1
fi
echo "shard output:"
ls -l "bench/results-shards/${{ matrix.scenario }}.json"
- name: Upload shard artifact
uses: actions/upload-artifact@v4
with:
name: bench-${{ matrix.scenario }}
path: |
bench/results-shards/${{ matrix.scenario }}.json
bench/results-shards/figures/${{ matrix.scenario }}.png
if-no-files-found: error
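# Merge the per-scenario shard JSON files, publish the combined nightly results, and render the step summary and figures.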
aggregate:
name: Aggregate benchmark results
needs:
- bench-shard
- bench-shard-serial
# Run even when a shard job was skipped (e.g. an empty serial matrix);
# still skip on shard failure or run cancellation.
if: ${{ !cancelled() && needs.bench-shard.result != 'failure' && needs.bench-shard-serial.result != 'failure' }}
runs-on: ubuntu-latest
steps:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.11"
- name: Checkout for figure generation
uses: actions/checkout@v5
with:
fetch-depth: 1
sparse-checkout: bench
- name: Download all shard artifacts
shell: bash
env:
GH_TOKEN: ${{ github.token }}
run: |
set -euo pipefail
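# Download every bench-* shard artifact from this run (excluding bench-runtime) into bench/artifacts/ for merging.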
python3 - <<'PY'
import json
import subprocess
import zipfile
from pathlib import Path
repo = "${{ github.repository }}"
run_id = "${{ github.run_id }}"
root = Path("bench/artifacts")
root.mkdir(parents=True, exist_ok=True)
proc = subprocess.run(
["gh", "api", f"/repos/{repo}/actions/runs/{run_id}/artifacts?per_page=100"],
check=True,
capture_output=True,
text=True,
)
artifacts = json.loads(proc.stdout).get("artifacts", [])
shard_artifacts = [
a for a in artifacts
if str(a.get("name", "")).startswith("bench-")
and str(a.get("name", "")) != "bench-runtime"
and not a.get("expired")
]
if not shard_artifacts:
raise SystemExit("No shard artifacts found to merge.")
for artifact in shard_artifacts:
artifact_dir = root / str(artifact["name"])
artifact_dir.mkdir(parents=True, exist_ok=True)
zip_path = artifact_dir.with_suffix(".zip")
with zip_path.open("wb") as fh:
subprocess.run(
["gh", "api", f"/repos/{repo}/actions/artifacts/{artifact['id']}/zip"],
check=True,
stdout=fh,
)
with zipfile.ZipFile(zip_path) as zf:
zf.extractall(artifact_dir)
zip_path.unlink()
PY
- name: Merge shard JSON outputs
shell: bash
run: |
set -euo pipefail
python3 - <<'PY'
import json
import os
from datetime import datetime, timezone
from pathlib import Path
def fmt_num(v, digits=4):
if v is None:
return "—"
try:
return f"{float(v):.{digits}f}"
except Exception:
return "—"
def fmt_status(row):
status = str(row.get("status", "unknown"))
if status == "ok":
return "ok"
return f"failed: {row.get('error', 'unknown error')}"
def md_escape(v):
if v is None:
return "—"
s = str(v).replace("\n", " ").strip()
return s.replace("|", "\\|")
artifacts = Path("bench/artifacts")
files = sorted(artifacts.rglob("*.json"))
if not files:
raise SystemExit("No shard artifacts found to merge.")
merged = []
for fp in files:
payload = json.loads(fp.read_text())
merged.extend(payload.get("results", []))
merged_sorted = sorted(
merged,
key=lambda r: (
str(r.get("scenario_name", "")),
str(r.get("contender", "")),
),
)
out = Path("bench/results.nightly.json")
out.write_text(
json.dumps(
{
"created_at_utc": datetime.now(timezone.utc).isoformat(),
"results": merged,
},
indent=2,
)
)
print(f"Wrote {out} from {len(files)} shard files with {len(merged)} rows.")
summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
if summary_path:
ok_count = sum(1 for r in merged_sorted if str(r.get("status", "")) == "ok")
fail_count = len(merged_sorted) - ok_count
family_labels = {"binomial": "Classification", "gaussian": "Regression", "survival": "Survival"}
family_rows = {}
for r in merged_sorted:
fam = str(r.get("family") or "unknown")
family_rows.setdefault(fam, []).append(r)
def primary_metric(row):
fam = str(row.get("family") or "")
if fam == "binomial":
return "Brier", row.get("brier"), "min"
if fam == "gaussian":
return "RMSE", row.get("rmse"), "min"
if fam == "survival":
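# Survival rows carry the C-index in the "auc" field, matching the "C-index/AUC" column below.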
return "C-index", row.get("auc"), "max"
return "n/a", None, "min"
def winner_for_family(fam):
rows = [r for r in family_rows.get(fam, []) if str(r.get("status", "")) == "ok"]
if not rows:
return None
vals = []
for r in rows:
_, v, mode = primary_metric(r)
if v is None:
continue
vals.append((float(v), r, mode))
if not vals:
return None
mode = vals[0][2]
return (min(vals, key=lambda x: x[0]) if mode == "min" else max(vals, key=lambda x: x[0]))[1]
lines = []
lines.append("## Benchmark Results")
lines.append("")
lines.append(f"- Generated: `{datetime.now(timezone.utc).isoformat()}`")
lines.append(f"- Shards merged: `{len(files)}`")
lines.append(f"- Result rows: `{len(merged_sorted)}` (`{ok_count}` ok, `{fail_count}` failed)")
lines.append("- Primary metrics by task:")
lines.append(" - Classification: lower Brier (with AUC + LogLoss shown)")
lines.append(" - Regression: lower RMSE (with MAE + R2 shown)")
lines.append(" - Survival: higher C-index")
lines.append("")
lines.append("### Leaders")
lines.append("")
lines.append("| Task | Winner | Primary Metric | Value |")
lines.append("|---|---|---|---:|")
for fam in ("binomial", "gaussian", "survival"):
winner = winner_for_family(fam)
if winner is None:
lines.append(f"| {family_labels.get(fam, fam)} | — | — | — |")
continue
m_name, m_val, _ = primary_metric(winner)
lines.append(
f"| {family_labels.get(fam, fam)} | "
f"{md_escape(winner.get('contender'))} ({md_escape(winner.get('scenario_name'))}) | "
f"{m_name} | {fmt_num(m_val)} |"
)
lines.append("")
lines.append("### Full Table")
lines.append("")
lines.append("| Task | Scenario | Contender | Status | C-index/AUC | Brier | LogLoss | RMSE | MAE | R2 | Fit (s) | Predict (s) |")
lines.append("|---|---|---|---|---:|---:|---:|---:|---:|---:|---:|---:|")
for r in merged_sorted:
fam = str(r.get("family") or "unknown")
lines.append(
"| "
+ " | ".join(
[
md_escape(family_labels.get(fam, fam)),
md_escape(r.get("scenario_name")),
md_escape(r.get("contender")),
md_escape(fmt_status(r)),
fmt_num(r.get("auc")),
fmt_num(r.get("brier")),
fmt_num(r.get("logloss")),
fmt_num(r.get("rmse")),
fmt_num(r.get("mae")),
fmt_num(r.get("r2")),
fmt_num(r.get("fit_sec")),
fmt_num(r.get("predict_sec")),
]
)
+ " |"
)
lines.append("")
lines.append("### Model Specs")
lines.append("")
lines.append("| Scenario | Contender | Model Spec |")
lines.append("|---|---|---|")
for r in merged_sorted:
lines.append(
f"| {md_escape(r.get('scenario_name'))} | {md_escape(r.get('contender'))} | {md_escape(r.get('model_spec'))} |"
)
Path(summary_path).write_text("\n".join(lines) + "\n")
PY
- name: Upload merged nightly results
uses: actions/upload-artifact@v4
with:
name: benchmark-results-nightly
path: bench/results.nightly.json
if-no-files-found: error
- name: Generate merged figures
shell: bash
run: |
set -euo pipefail
python3 -m pip install --quiet matplotlib numpy pandas
python3 bench/generate_figures.py bench/results.nightly.json bench/figures bench/figures.zip
- name: Upload benchmark figures
uses: actions/upload-artifact@v4
with:
name: benchmark-figures
path: bench/figures.zip
if-no-files-found: warn