# memspan 0.1.0
#
# SIMD-accelerated byte-class scanning for lexers and parsers. Backends: AVX-512, AVX2, SSE4.1, NEON, WASM SIMD128. no_std compatible.
# Documentation
# Benchmark workflow. Runs manually, on pull requests, and on pushes to
# `main`; the push and pull-request triggers are limited to changes that can
# affect benchmark results (bench/source code, manifests, this workflow).
name: Benchmarks

on:
  workflow_dispatch:
  pull_request:
    paths:
      - benches/**
      - src/**
      - Cargo.toml
      - Cargo.lock
      - .github/workflows/benchmark.yml
  push:
    branches: [main]
    paths:
      - benches/**
      - src/**
      - Cargo.toml
      - Cargo.lock
      - .github/workflows/benchmark.yml

# The workflow token only needs to read the repository contents.
permissions:
  contents: read

# Applied to every job and step in this workflow.
env:
  RUST_BACKTRACE: '1'
  CARGO_TERM_COLOR: always

jobs:
  # One job per matrix entry; each entry pins a runner image and the rustc
  # flags that select which backend the benchmarks exercise.
  benchmark:
    name: ${{ matrix.label }}
    runs-on: ${{ matrix.os }}
    strategy:
      # A failing tier must not cancel the remaining tiers.
      fail-fast: false
      matrix:
        include:
          # NEON on aarch64: no extra flags, so the runtime dispatcher
          # selects NEON. This covers the hand-tuned NEON path behind every
          # `skip::skip_*` function and every `skip_class!`-generated fn.
          - label: macos-aarch64-neon
            os: macos-latest
            arch: aarch64
            tier: neon
            rustflags: ''

          # Scalar baseline on the same aarch64 hardware:
          # `memspan_force_scalar` turns NEON off, so each call takes the
          # scalar fallback. Comparing against the NEON tier above measures
          # the NEON win without cross-runner noise.
          - label: macos-aarch64-scalar
            os: macos-latest
            arch: aarch64
            tier: scalar
            rustflags: '--cfg memspan_force_scalar'

          # Default x86_64: the dispatcher uses the best x86 tier the runner
          # offers. Standard ubuntu-latest runners are AMD EPYC 7763 (Milan),
          # which has AVX2 but not AVX-512, so in practice this benchmarks
          # the AVX2 kernel.
          - label: ubuntu-x86_64-default
            os: ubuntu-latest
            arch: x86_64
            tier: default
            rustflags: ''

          # There is deliberately no AVX-512 benchmark tier. Free
          # GitHub-hosted runners are AMD Milan (no AVX-512), and Intel SDE
          # emulation is ~5-10× off real hardware, so its numbers are not
          # worth recording. AVX-512 correctness is covered by the
          # `test-sde` job in ci.yml.

          # AVX2 ceiling: AVX-512 is compiled out, which forces the AVX2
          # dispatch branch even on a runner that would otherwise choose
          # AVX-512. Yields explicit AVX2 numbers whatever the runner CPU.
          - label: ubuntu-x86_64-avx2-max
            os: ubuntu-latest
            arch: x86_64
            tier: avx2-max
            rustflags: '--cfg memspan_disable_avx512'

          # SSE4.2 ceiling: AVX-512 and AVX2 are both compiled out, forcing
          # the SSE4.2 dispatch branch. SSE4.2 has shipped on every x86_64
          # CPU since ~2008, so this kernel runs on any runner.
          - label: ubuntu-x86_64-sse42-max
            os: ubuntu-latest
            arch: x86_64
            tier: sse42-max
            rustflags: '--cfg memspan_disable_avx512 --cfg memspan_disable_avx2'

          # Scalar baseline on x86_64: every SIMD backend is
          # short-circuited. Compare with `ubuntu-x86_64-default` to measure
          # the SIMD win on Linux/x86_64.
          - label: ubuntu-x86_64-scalar
            os: ubuntu-latest
            arch: x86_64
            tier: scalar
            rustflags: '--cfg memspan_force_scalar'

          # x86_64 built with `-C target-cpu=native`: LLVM may use the full
          # feature set of the CPU the build runs on, both to auto-vectorize
          # scalar paths and to maximize codegen quality for SIMD kernels.
          #
          # The flag is carried in `target_rustflags` (exported through the
          # per-target `CARGO_TARGET_*_RUSTFLAGS` variable) rather than in
          # the global `rustflags` field. Stated reason: a global `RUSTFLAGS`
          # reaches *every* rustc invocation, including proc-macro dylibs
          # built for the host; those end up with host-CPU-specific
          # instructions and SIGILL when rustc loads them in a different
          # execution context.
          # NOTE(review): the Cargo reference says rustflags from any source
          # also reach host artifacts unless `--target` is passed — confirm
          # that the per-target variable alone gives the intended scoping.
          - label: ubuntu-x86_64-native
            os: ubuntu-latest
            arch: x86_64
            tier: native
            rustflags: ''
            target_rustflags: '-C target-cpu=native'

          # Windows x86_64: same dispatcher as Linux; additionally checks
          # that the MSVC toolchain copes with the intrinsics-heavy modules.
          - label: windows-x86_64-default
            os: windows-latest
            arch: x86_64
            tier: default
            rustflags: ''
    # Job-wide environment: both rustflags variables are filled in from the
    # current matrix entry and are visible to every step below.
    env:
      # Global rustc flags for this tier. Exported for every tier, as an
      # empty string when the matrix entry sets `rustflags: ''`.
      RUSTFLAGS: ${{ matrix.rustflags }}
      # Per-target rustflags for the x86_64-unknown-linux-gnu triple; only
      # the `native` tier sets `target_rustflags`, every other tier resolves
      # to an empty string here. The intent is to keep `-C target-cpu=native`
      # away from host-built proc-macro dylibs, which SIGILL'd when the flag
      # was supplied through the global `RUSTFLAGS`.
      #
      # Caveats from the Cargo reference (`build.rustflags`):
      # - The rustflags sources are mutually exclusive and `RUSTFLAGS` is
      #   checked before `target.<triple>.rustflags`. A `RUSTFLAGS` that is
      #   exported but empty still counts as set, so whatever invokes cargo
      #   must make sure `RUSTFLAGS` is unset for this variable to apply.
      # - Flags stay off build scripts and proc macros only when cargo is
      #   run with an explicit `--target`.
      CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: ${{ matrix.target_rustflags || '' }}
    steps:
      - uses: actions/checkout@v6

      # Bring the runner's stable toolchain up to date and make it the
      # default. Kept as a single `&&` command so the second half only runs
      # when the update succeeded, whichever default shell the runner uses.
      - name: Install Rust
        run: >-
          rustup update stable --no-self-update
          && rustup default stable

      # Diagnostics only: log which CPU the runner has so benchmark numbers
      # can be tied to hardware. Exactly one of the three steps runs,
      # selected by the runner's operating system.
      - name: Print CPU info (Linux)
        shell: bash
        if: ${{ runner.os == 'Linux' }}
        # `|| true` keeps a missing tool or file from failing the job.
        run: |
          echo "=== /proc/cpuinfo (first flags line) ==="
          grep -m1 '^flags' /proc/cpuinfo || true
          echo "=== lscpu ==="
          lscpu || true

      - name: Print CPU info (macOS)
        shell: bash
        if: ${{ runner.os == 'macOS' }}
        run: |
          echo "=== sysctl machdep.cpu ==="
          sysctl machdep.cpu || true
          echo "=== uname -m ==="
          uname -m

      - name: Print CPU info (Windows)
        shell: pwsh
        if: ${{ runner.os == 'Windows' }}
        run: |
          Get-CimInstance Win32_Processor |
            Select-Object Name, Manufacturer, NumberOfCores, NumberOfLogicalProcessors |
            Format-List

      # Reuse the cargo registry, git checkouts and build output between
      # runs. The key is specific to OS, tier and lockfile contents; when no
      # exact match exists the restore falls back to the newest cache for the
      # same OS and tier, then to any benchmark cache for the same OS.
      - name: Cache cargo build and registry
        uses: actions/cache@v5
        with:
          key: ${{ runner.os }}-bench-${{ matrix.tier }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            ${{ runner.os }}-bench-${{ matrix.tier }}-
            ${{ runner.os }}-bench-
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target

      # Run every bench target in Criterion's quick mode and keep a copy of
      # the console output next to the workspace for the upload step.
      # `shell: bash` runs with `-eo pipefail`, so a cargo failure is not
      # masked by the `tee` at the end of the pipeline.
      - name: Run benchmarks
        shell: bash
        run: |
          # Cargo's rustflags sources are mutually exclusive and RUSTFLAGS is
          # consulted before `target.<triple>.rustflags`. The job exports
          # RUSTFLAGS for every tier, so where it is empty it would still
          # shadow CARGO_TARGET_*_RUSTFLAGS and the `native` tier would be
          # built without `-C target-cpu=native`. Drop it when it is empty.
          if [ -z "${RUSTFLAGS:-}" ]; then
            unset RUSTFLAGS
          fi

          # Cargo only keeps rustflags away from host artifacts (proc macros,
          # build scripts) when it is given an explicit `--target`, so name
          # the matching triple whenever per-target flags are in use.
          target_args=()
          if [ -n "${CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS:-}" ]; then
            target_args=(--target x86_64-unknown-linux-gnu)
          fi

          cargo bench "${target_args[@]}" --benches -- --quick --output-format criterion \
            | tee "benchmark-all-${{ matrix.label }}.txt"
        continue-on-error: false

      # Write a small Markdown summary of this matrix entry (runner, tier,
      # flags, UTC timestamp) that is uploaded alongside the results.
      - name: Collect system info
        shell: bash
        run: |
          report="benchmark-info-${{ matrix.label }}.md"
          {
            echo "## ${{ matrix.label }}"
            echo "- OS: ${{ matrix.os }}"
            echo "- Arch: ${{ matrix.arch }}"
            echo "- SIMD tier: ${{ matrix.tier }}"
            echo "- Runner: ${{ runner.name }}"
            echo "- Runner arch (GH): ${{ runner.arch }}"
            echo "- RUSTFLAGS: \`${{ matrix.rustflags }}\`"
            # Only tiers that define `target_rustflags` get this line.
            if [ -n "${{ matrix.target_rustflags || '' }}" ]; then
              echo "- CARGO_TARGET_X86_64_UNKNOWN_LINUX_GNU_RUSTFLAGS: \`${{ matrix.target_rustflags || '' }}\`"
            fi
            echo "- Date: $(date -u +"%Y-%m-%d %H:%M:%S UTC")"
          } > "$report"

      # Publish the console log, the system-info summary and Criterion's
      # report directory as one artifact per matrix entry, kept for 90 days.
      - name: Upload benchmark results
        uses: actions/upload-artifact@v7
        with:
          name: benchmark-results-${{ matrix.label }}
          retention-days: 90
          path: |
            benchmark-all-${{ matrix.label }}.txt
            benchmark-info-${{ matrix.label }}.md
            target/criterion/