# colconv 0.1.0
#
# SIMD-dispatched color-conversion kernels covering the FFmpeg AVPixelFormat
# space, with a Sink-based API so consumers pick which derived outputs
# (RGB / Luma / HSV / custom) they want without paying for the ones they
# don't.
#
# Documentation
name: Benchmarks

# Triggers: pushes to `main` and pull requests, in both cases only when
# one of the listed paths changed (bench sources, library sources, the
# Cargo manifest / lockfile, or the benchmark workflow file), plus manual
# runs via `workflow_dispatch`. The two `paths` lists are identical and
# should be kept in sync.
on:
  push:
    branches:
      - main
    paths:
      - 'benches/**'
      - 'src/**'
      - 'Cargo.toml'
      - 'Cargo.lock'
      - '.github/workflows/benchmark.yml'
  pull_request:
    paths:
      - 'benches/**'
      - 'src/**'
      - 'Cargo.toml'
      - 'Cargo.lock'
      - '.github/workflows/benchmark.yml'
  workflow_dispatch:

env:
  CARGO_TERM_COLOR: always
  RUST_BACKTRACE: 1

# `contents: read` is sufficient — we no longer comment on PRs.
# Reviewers download the Criterion artifacts manually from the workflow
# run page.
permissions:
  contents: read

jobs:
  benchmark:
    name: ${{ matrix.label }}
    strategy:
      matrix:
        include:
          # aarch64 NEON — runtime dispatcher picks NEON; scalar variant in
          # each bench exercised via `use_simd=false`.
          - os: macos-latest
            arch: aarch64
            tier: neon
            rustflags: ''
            label: macos-aarch64-neon

          # aarch64 with NEON short-circuited via `colconv_force_scalar`:
          # the dispatcher takes the scalar path on every call. This
          # produces a scalar baseline comparable to the one measured by
          # the `use_simd=false` bench variant, but with the dispatcher
          # itself routed to scalar instead of being bypassed.
          - os: macos-latest
            arch: aarch64
            tier: scalar
            rustflags: '--cfg colconv_force_scalar'
            label: macos-aarch64-scalar

          # x86_64 default — runtime dispatcher picks whichever x86 tier
          # the runner supports. Standard ubuntu-latest is AMD EPYC 7763
          # (Milan) which has AVX2 but NOT AVX-512, so this tier ends up
          # exercising the AVX2 kernel in practice. This matrix has no
          # AVX-512 row; see the note further down for why.
          - os: ubuntu-latest
            arch: x86_64
            tier: default
            rustflags: ''
            label: ubuntu-x86_64-default

          # Note: no AVX-512 bench tier. GitHub-hosted free runners are
          # AMD Milan (no AVX-512), and emulated numbers from Intel SDE
          # are ~5-10x off real hardware — not worth measuring. Test
          # correctness of the AVX-512 kernel is covered by the
          # `test-sde` job in ci.yml instead.

          # x86_64 with AVX-512 disabled: forces the AVX2 dispatch branch
          # even on a runner whose CPU supports AVX-512. Gives explicit
          # AVX2-tier numbers regardless of runner CPU.
          - os: ubuntu-latest
            arch: x86_64
            tier: avx2-max
            rustflags: '--cfg colconv_disable_avx512'
            label: ubuntu-x86_64-avx2-max

          # x86_64 with AVX-512 and AVX2 both disabled: forces the SSE4.1
          # dispatch branch. Every x86_64 CPU since ~2008 has SSE4.1, so
          # this tier exercises the SSE4.1 kernel on every runner.
          - os: ubuntu-latest
            arch: x86_64
            tier: sse41-max
            rustflags: '--cfg colconv_disable_avx512 --cfg colconv_disable_avx2'
            label: ubuntu-x86_64-sse41-max

          # x86_64 with every SIMD backend short-circuited: scalar-only
          # baseline. Complements `use_simd=false` variants inside each
          # bench (this tier also routes the dispatcher itself to scalar).
          - os: ubuntu-latest
            arch: x86_64
            tier: scalar
            rustflags: '--cfg colconv_force_scalar'
            label: ubuntu-x86_64-scalar

          # x86_64 with `-C target-cpu=native`: enables the full feature
          # set of the runner's build-time CPU for LLVM auto-vectorization
          # of scalar paths and maximum codegen quality for SIMD kernels.
          #
          # `native` uses `target_rustflags` (routed via the per-target
          # `CARGO_TARGET_*_RUSTFLAGS` env var) instead of the global
          # `rustflags` field. The global `RUSTFLAGS` applies to *every*
          # rustc invocation, including proc macro dylibs built for the
          # host (`thiserror_impl`, etc.), which then get codegen'd with
          # host-CPU-specific instructions and SIGILL when rustc loads
          # them in a different execution context. The per-target var
          # scopes `-C target-cpu=native` to the target crate only, so
          # proc macros stay generic and we still get maximum codegen
          # quality for the benchmark build.
          - os: ubuntu-latest
            arch: x86_64
            tier: native
            rustflags: ''
            target_rustflags: '-C target-cpu=native'
            label: ubuntu-x86_64-native

          # Windows x86_64 — same dispatcher as Linux but validates the
          # MSVC toolchain handles the intrinsics-heavy modules.
          - os: windows-latest
            arch: x86_64
            tier: default
            rustflags: ''
            label: windows-x86_64-default

    runs-on: ${{ matrix.os }}
    env:
      # Global rustc flags taken from the matrix row. Rows with
      # `rustflags: ''` presumably still export `RUSTFLAGS` as an empty
      # string rather than leaving it unset — TODO confirm.
      RUSTFLAGS: ${{ matrix.rustflags }}
      # Per-target rustflags for the Linux x86_64 triple. Only the
      # `native` row sets `target_rustflags`; every other row falls back
      # to '' here. The intent is to keep `-C target-cpu=native` off
      # host-compiled proc-macro dylibs, which can SIGILL when rustc
      # loads them.
      # NOTE(review): per the Cargo configuration reference, per-target
      # flags are withheld from host artifacts (proc macros, build
      # scripts) only when cargo is invoked with an explicit `--target`;
      # without it they reach every rustc invocation, like `RUSTFLAGS`.
      # The same reference lists the `RUSTFLAGS` env var ahead of
      # `target.<triple>.rustflags` as mutually exclusive sources, so
      # confirm the empty `RUSTFLAGS` above does not mask this value.
      CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: ${{ matrix.target_rustflags || '' }}
    steps:
      - uses: actions/checkout@v6

      - name: Install Rust
        run: rustup update stable --no-self-update && rustup default stable

      # Diagnostic steps: exactly one of the three runs per job, selected
      # by `runner.os`, and prints the runner's CPU details into the log.
      # Most of the Linux / macOS probes are best-effort (`|| true`).
      - name: Print CPU info (Linux)
        if: runner.os == 'Linux'
        shell: bash
        run: |
          echo "=== /proc/cpuinfo (first flags line) ==="
          grep -m1 '^flags' /proc/cpuinfo || true
          echo "=== lscpu ==="
          lscpu || true

      - name: Print CPU info (macOS)
        if: runner.os == 'macOS'
        shell: bash
        run: |
          echo "=== sysctl machdep.cpu ==="
          sysctl machdep.cpu || true
          echo "=== uname -m ==="
          uname -m

      - name: Print CPU info (Windows)
        if: runner.os == 'Windows'
        shell: pwsh
        run: |
          Get-CimInstance Win32_Processor | Select-Object Name, Manufacturer, NumberOfCores, NumberOfLogicalProcessors | Format-List

      # Caches the cargo registry, cargo git checkouts and the `target`
      # build directory. The exact key is per OS, per SIMD tier and per
      # `Cargo.lock` hash; `restore-keys` fall back first to any cache of
      # the same OS + tier, then to any bench cache of the same OS.
      # NOTE(review): the last fallback can restore a `target` built with
      # another tier's flags; presumably cargo simply rebuilds — confirm.
      - name: Cache cargo build and registry
        uses: actions/cache@v5
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: ${{ runner.os }}-bench-${{ matrix.tier }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-bench-${{ matrix.tier }}-
            ${{ runner.os }}-bench-

      - name: Run benchmarks
        shell: bash
        # `--benches` limits cargo to the registered bench targets.
        # Without it, `cargo bench` also runs the library's `#[test]`
        # harness in release mode. Uses Criterion's native output
        # format (per-bench `time:` / `thrpt:` blocks with confidence
        # intervals) — the full HTML report lives in `target/criterion/`
        # and is uploaded separately below.
        run: |
          # The job-level `env` exports `RUSTFLAGS` for every tier, as an
          # empty string when the matrix row has no global flags. Cargo
          # checks the `RUSTFLAGS` variable before per-target rustflags
          # and treats the sources as mutually exclusive, so a
          # set-but-empty `RUSTFLAGS` can mask
          # `CARGO_TARGET_*_RUSTFLAGS` (the `native` tier). Drop it when
          # empty so cargo falls through to the per-target value.
          if [ -z "${RUSTFLAGS:-}" ]; then
            unset RUSTFLAGS
          fi

          # Cargo keeps per-target rustflags away from host artifacts
          # (proc macros, build scripts) only when it is given an
          # explicit `--target`. Pass it whenever the tier opted into
          # per-target flags, so `-C target-cpu=native` stays scoped to
          # the benchmarked crates.
          target_args=()
          if [ -n "${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS:-}" ]; then
            target_args=(--target x86_64-unknown-linux-gnu)
          fi

          cargo bench --benches "${target_args[@]}" | tee benchmark-all-${{ matrix.label }}.txt
        continue-on-error: false

      - name: Write run metadata
        shell: bash
        # Emits a short Markdown summary (OS, arch, tier, runner, flags,
        # UTC date) next to the raw Criterion text, so a downloaded
        # archive states where its numbers came from. The per-bench
        # results themselves are in `benchmark-all-${LABEL}.txt` and the
        # HTML report under `target/criterion/`; both are uploaded by
        # later steps.
        run: |
          summary_file="benchmark-metadata-${{ matrix.label }}.md"
          {
            echo "# Benchmark metadata: ${{ matrix.label }}"
            echo ""
            echo "- OS: ${{ matrix.os }}"
            echo "- Arch: ${{ matrix.arch }}"
            echo "- SIMD tier: ${{ matrix.tier }}"
            echo "- Runner: ${{ runner.name }}"
            echo "- Runner arch (GH): ${{ runner.arch }}"
            echo "- RUSTFLAGS: \`${{ matrix.rustflags }}\`"
            # Only tiers that set per-target flags get this extra line.
            if [ -n "${{ matrix.target_rustflags || '' }}" ]; then
              echo "- CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: \`${{ matrix.target_rustflags || '' }}\`"
            fi
            echo "- Date: $(date -u +"%Y-%m-%d %H:%M:%S UTC")"
          } > "$summary_file"
          cat "$summary_file"

      # Gathers everything for the per-matrix artifact into
      # `benchmark-results/`: the tee'd text output, the metadata file and
      # a copy of `target/criterion` (only when that directory exists).
      # Every move / copy is best-effort (`|| true`, stderr discarded on
      # the moves), so a missing file does not fail the step.
      - name: Create benchmark archive
        shell: bash
        run: |
          mkdir -p benchmark-results
          mv benchmark-*.txt benchmark-results/ 2>/dev/null || true
          mv benchmark-metadata-${{ matrix.label }}.md benchmark-results/ 2>/dev/null || true
          if [ -d "target/criterion" ]; then
            cp -r target/criterion benchmark-results/criterion-${{ matrix.label }} || true
          fi

      # Per-matrix bundle assembled in `benchmark-results/` (text output,
      # metadata and the copied Criterion directory), kept for 90 days.
      - name: Upload benchmark results
        uses: actions/upload-artifact@v7
        with:
          name: benchmark-results-${{ matrix.label }}
          path: benchmark-results/
          retention-days: 90

      # Raw `target/criterion/` tree as its own artifact. `if: always()`
      # makes this step run even when an earlier step failed.
      - name: Upload Criterion detailed results
        uses: actions/upload-artifact@v7
        if: always()
        with:
          name: criterion-detailed-${{ matrix.label }}
          path: target/criterion/
          retention-days: 90
        continue-on-error: false

  # Aggregate results from all platforms and SIMD tiers.
  aggregate-results:
    name: Aggregate benchmark results
    needs: benchmark
    runs-on: ubuntu-latest
    if: always()
    steps:
      # No `name` / `pattern` input is given, so this fetches every
      # artifact of the run into `all-results/` — presumably one
      # subdirectory per artifact name (confirm against the
      # download-artifact docs). That includes both the
      # `benchmark-results-*` and the `criterion-detailed-*` uploads.
      - name: Download all benchmark results
        uses: actions/download-artifact@v8
        with:
          path: all-results

      - name: Write combined index
        shell: bash
        # Builds a top-level `BENCHMARK_INDEX.md`: a dated heading, a
        # pointer to where the raw Criterion output sits, then every
        # per-matrix metadata file found under `all-results/`, each
        # followed by a `---` separator. The full Criterion output
        # (txt + HTML report) stays in the per-matrix subdirectories.
        run: |
          index="BENCHMARK_INDEX.md"
          {
            echo "# Benchmark run: $(date -u +"%Y-%m-%d %H:%M:%S UTC")"
            echo ""
            echo "Raw Criterion output per matrix entry is under"
            echo "\`all-results/benchmark-results-<label>/\` (txt + HTML report)."
            echo ""

            for entry in all-results/benchmark-results-*/benchmark-metadata-*.md; do
              # An unmatched glob stays literal; skip anything that is
              # not a real file.
              [ -f "$entry" ] || continue
              echo ""
              cat "$entry"
              echo ""
              echo "---"
            done
          } > "$index"

          cat "$index"

      - name: Upload combined results
        uses: actions/upload-artifact@v7
        with:
          name: benchmark-results-combined
          path: |
            BENCHMARK_INDEX.md
            all-results/
          retention-days: 90