name: Performance Regression Testing
on:
workflow_dispatch:
pull_request:
branches:
- main
paths:
- "src/**"
- "benches/**"
- "Cargo.toml"
- "Cargo.lock"
push:
branches:
- main
paths:
- "src/**"
- "benches/**"
- "Cargo.toml"
- "Cargo.lock"
permissions:
contents: read
actions: read
pull-requests: read
concurrency:
group: perf-regress-${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
CARGO_TERM_COLOR: always
RUST_BACKTRACE: 1
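  # BENCHMARK_TIMEOUT is forwarded to `benchmark-utils compare --bench-timeout` in the comparison step below.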
  BENCHMARK_TIMEOUT: 7200
  DELAUNAY_BENCH_SEED_SEARCH_LIMIT: 4096
jobs:
performance-regression:
runs-on: macos-15
timeout-minutes: 135
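    # Note: BENCHMARK_TIMEOUT (7200, presumably seconds, i.e. ~120 min) must stay below this 135-minute job limit.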
steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd
        with:
fetch-depth: 0
- name: Install Rust toolchain
        uses: actions-rust-lang/setup-rust-toolchain@a0b538fa0b742a6aa35d6e2c169b4bd06d225a98
        with:
cache: true
- name: Install uv (Python package manager)
        uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098
        with:
version: "latest"
- name: Verify uv installation
run: uv --version
- name: Find baseline artifact (latest semver tag baseline)
id: find_baseline
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd
        with:
script: |
try {
const prefix = 'performance-baseline-';
              // Accept both legacy dotted names (vX.Y.Z) and underscore-separated names (vX_Y_Z).
const semverRe = /^performance-baseline-v(\d+)[._](\d+)[._](\d+)(?:[._-].*)?$/;
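              // Matches e.g. "performance-baseline-v1.2.3" or "performance-baseline-v1_2_3-foo" (optional suffix after the patch number).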
const parseSemver = (name) => {
const m = name.match(semverRe);
if (!m) return null;
return [parseInt(m[1], 10), parseInt(m[2], 10), parseInt(m[3], 10)];
};
const compareSemver = (a, b) => {
for (let i = 0; i < 3; i += 1) {
if (a[i] !== b[i]) return a[i] - b[i];
}
return 0;
};
// Fetch successful generate-baseline.yml runs (most recent first)
// Note: Each run requires a follow-up listWorkflowRunArtifacts call, so keep this cap conservative
// to avoid slowdowns and API rate limits.
const MAX_RUNS = 50;
console.log(`Fetching recent generate-baseline.yml runs (up to ${MAX_RUNS})...`);
let count = 0;
const runs = await github.paginate(
github.rest.actions.listWorkflowRuns,
{
owner: context.repo.owner,
repo: context.repo.repo,
workflow_id: 'generate-baseline.yml',
                  status: 'success',
per_page: 100
},
(response, done) => {
// Limit to MAX_RUNS runs total across pages (no overshoot)
const remaining = Math.max(0, MAX_RUNS - count);
if (remaining === 0) { done(); return []; }
const slice = response.data.slice(0, remaining);
count += slice.length;
if (count >= MAX_RUNS) done();
return slice;
}
);
console.log(`Found ${runs.length} successful generate-baseline runs`);
// Build artifact cache: artifact name → {run_id, run_created_at}
// Note: We keep the *newest* run for a given artifact name (runs are newest-first).
const artifactCache = new Map();
for (const run of runs) {
try {
const artifacts = await github.rest.actions.listWorkflowRunArtifacts({
owner: context.repo.owner,
repo: context.repo.repo,
run_id: run.id
});
for (const artifact of artifacts.data.artifacts) {
if (artifact.expired === true) continue;
if (!artifact.name.startsWith(prefix)) continue;
if (!artifactCache.has(artifact.name)) {
artifactCache.set(artifact.name, {
run_id: run.id,
run_created_at: run.created_at
});
}
}
} catch (error) {
console.log(`Warning: Could not fetch artifacts for run ${run.id}: ${error.message}`);
continue;
}
}
console.log(`Built cache of ${artifactCache.size} baseline artifacts`);
// Prefer the highest semver tag baseline present in the cache.
let best = null;
for (const [artifactName, artifactInfo] of artifactCache.entries()) {
const ver = parseSemver(artifactName);
if (!ver) continue;
const tag = artifactName.slice(prefix.length);
if (!best || compareSemver(ver, best.ver) > 0) {
best = { name: artifactName, info: artifactInfo, ver, tag };
}
}
if (best) {
console.log(`Selected baseline ${best.name} (tag ${best.tag}) from run ${best.info.run_id}`);
core.setOutput('found', 'true');
core.setOutput('artifact_name', best.name);
core.setOutput('run_id', best.info.run_id.toString());
core.setOutput('tag', best.tag);
return;
}
// Fallback: pick the most recent baseline artifact (including manual runs)
if (artifactCache.size > 0) {
let mostRecent = null;
let mostRecentTime = null;
for (const [artifactName, artifactInfo] of artifactCache.entries()) {
const runTime = new Date(artifactInfo.run_created_at);
if (!mostRecentTime || runTime > mostRecentTime) {
mostRecentTime = runTime;
mostRecent = { name: artifactName, info: artifactInfo };
}
}
if (mostRecent) {
console.log(
`Fallback: selected most recent baseline ${mostRecent.name} in run ${mostRecent.info.run_id} ` +
`(created: ${mostRecent.info.run_created_at})`
);
core.setOutput('found', 'true');
core.setOutput('artifact_name', mostRecent.name);
core.setOutput('run_id', mostRecent.info.run_id.toString());
core.setOutput('tag', mostRecent.name.slice(prefix.length));
return;
}
}
console.log('No baseline artifacts found');
core.setOutput('found', 'false');
} catch (error) {
console.error(`Error searching for baseline artifacts: ${error.message}`);
core.setOutput('found', 'false');
}
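      # The download may still fail (e.g. the artifact expired after the lookup above), hence continue-on-error below.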
- name: Download latest baseline artifact
if: steps.find_baseline.outputs.found == 'true'
        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3
        continue-on-error: true
with:
name: ${{ steps.find_baseline.outputs.artifact_name }}
path: baseline-artifact/
run-id: ${{ steps.find_baseline.outputs.run_id }}
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Prepare baseline for comparison
if: steps.find_baseline.outputs.found == 'true'
run: uv run benchmark-utils prepare-baseline
- name: Set baseline status if none found
if: steps.find_baseline.outputs.found != 'true'
run: uv run benchmark-utils set-no-baseline
- name: Skip benchmarks - no baseline available
if: env.BASELINE_EXISTS != 'true'
run: uv run benchmark-utils display-no-baseline
- name: Run performance regression test (compare vs tag baseline)
id: compare_regression
if: env.BASELINE_EXISTS == 'true'
continue-on-error: true
run: |
set -euo pipefail
# Ensure regression-summary reports this as a real run.
echo "SKIP_BENCHMARKS=false" >> "$GITHUB_ENV"
echo "SKIP_REASON=running" >> "$GITHUB_ENV"
echo " Baseline origin: ${BASELINE_ORIGIN:-unknown}"
echo " Baseline tag: ${BASELINE_TAG:-unknown}"
uv run benchmark-utils compare \
--baseline "baseline-artifact/baseline_results.txt" \
--bench-timeout "${BENCHMARK_TIMEOUT}" \
--dev
- name: Classify benchmark comparison outcome
if: env.BASELINE_EXISTS == 'true' && env.SKIP_BENCHMARKS == 'false'
run: |
set -euo pipefail
results_file="benches/compare_results.txt"
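          # compare_results.txt is expected to be written by `benchmark-utils compare` above; it is also uploaded as an artifact later.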
# Successful compare step => no regressions beyond configured threshold.
if [ "${{ steps.compare_regression.outcome }}" = "success" ]; then
echo "BENCHMARK_REGRESSION_DETECTED=false" >> "$GITHUB_ENV"
exit 0
fi
# Compare step failed. Distinguish "expected regression" from real benchmark errors.
if [ ! -f "$results_file" ]; then
echo "::error::Benchmark comparison failed and produced no results file."
exit 1
fi
if grep -q "❌ Error:" "$results_file"; then
echo "::error::Benchmark comparison failed due to benchmark execution error."
echo "::group::Benchmark comparison error details"
cat "$results_file"
echo "::endgroup::"
exit 1
fi
if grep -q "REGRESSION" "$results_file"; then
echo "BENCHMARK_REGRESSION_DETECTED=true" >> "$GITHUB_ENV"
warning_msg="Performance regressions detected vs baseline ${BASELINE_TAG:-unknown};"
warning_msg="${warning_msg} workflow allowed to pass by policy."
echo "::warning::${warning_msg}"
{
echo "### ⚠️ Performance Regression Detected"
echo ""
echo "- Baseline tag: \`${BASELINE_TAG:-unknown}\`"
echo "- Policy: regressions are warning-only in this workflow."
echo "- See uploaded artifact \`performance-regression-results-${{ github.run_number }}\`"
echo " and logs for details."
} >> "$GITHUB_STEP_SUMMARY"
exit 0
fi
echo "::error::Benchmark comparison failed for an unknown reason."
echo "::group::Benchmark comparison output"
cat "$results_file"
echo "::endgroup::"
exit 1
- name: Display regression test results
if: env.BASELINE_EXISTS == 'true' && env.SKIP_BENCHMARKS == 'false' && always()
run: uv run benchmark-utils display-results
- name: Upload regression test results
if: env.BASELINE_EXISTS == 'true' && env.SKIP_BENCHMARKS == 'false' && always()
        uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f
        with:
name: performance-regression-results-${{ github.run_number }}
path: |
benches/compare_results.txt
baseline-artifact/baseline_results.txt
if-no-files-found: warn
retention-days: 30
- name: Summary
if: always()
run: uv run benchmark-utils regression-summary