# llama-server 0.1.1
#
# Download, embed, and run llama.cpp in your Rust projects.
# Documentation
# Workflow that repackages upstream llama.cpp builds into release archives.
name: Build

on:
  # Re-run whenever this workflow definition itself changes on master.
  push:
    branches: [master]
    paths: [".github/workflows/build.yml"]
  # Allow manual runs.
  workflow_dispatch: {}
  # Daily run at 00:00 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: "0 0 * * *"

concurrency:
  # The group is keyed on the ref only when `github.head_ref` is set;
  # otherwise every run falls back to its own run id.
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Resolves the tag of the latest upstream llama.cpp release and exposes it
  # to the other jobs as `needs.find.outputs.build`.
  find:
    runs-on: ubuntu-latest
    permissions:
      packages: read
    outputs:
      build: ${{ steps.get_latest.outputs.tag }}
    steps:
      - name: Get latest build
        id: get_latest
        run: |
          # The default shell only enables `-e`; turn on pipefail so a failed
          # curl is not masked by jq succeeding on empty input.
          set -euo pipefail

          # --fail turns HTTP errors (e.g. API rate limiting) into a non-zero
          # exit instead of feeding the JSON error body to jq.
          build=$(curl --fail --silent --show-error --location \
            https://api.github.com/repos/ggml-org/llama.cpp/releases/latest | jq -r .tag_name)

          # jq prints "null" when the field is missing; never publish that
          # as a tag.
          if [ -z "$build" ] || [ "$build" = "null" ]; then
            echo "::error::Could not determine the latest llama.cpp release tag"
            exit 1
          fi

          # The tag ends up in file names and scripts of the downstream jobs,
          # so reject anything outside a conservative character set.
          case "$build" in
            *[!A-Za-z0-9._-]*)
              echo "::error::Unexpected characters in release tag: $build"
              exit 1
              ;;
          esac

          echo "llama.cpp latest build: $build"
          echo "tag=$build" >> "$GITHUB_OUTPUT"

  # Builds the CUDA flavour of `llama-server` with upstream's Dockerfile and
  # uploads two archives: the server itself and a separate CUDA backend
  # bundle (CUDA runtime libraries plus libggml-cuda.so).
  linux-cuda:
    needs: find
    runs-on: ubuntu-latest
    env:
      # Scripts read the upstream tag from the environment instead of having
      # it expanded into the script text, so it is never parsed as shell code.
      BUILD: ${{ needs.find.outputs.build }}
    steps:
      - name: Free Disk Space (Ubuntu)
        uses: ggml-org/free-disk-space@v1.3.1
        with:
          tool-cache: false
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: true

      - name: Clone llama.cpp@${{ needs.find.outputs.build }}
        # Clone from the same repository the tag was resolved from in `find`.
        run: git clone https://github.com/ggml-org/llama.cpp --branch="$BUILD" --depth=1

      - name: Build `llama-server`
        run: docker build -f llama.cpp/.devops/cuda.Dockerfile --target server --tag llama-server-cuda llama.cpp

      - name: Create llama-server container
        # A created (not started) container is enough for `docker cp`.
        run: docker create --name llama-server llama-server-cuda

      - name: Copy build artifacts from container
        run: docker cp llama-server:/app .

      - name: Copy CUDA runtime from container
        run: |
          # Pick the highest versioned /usr/local/cuda-X.Y directory in the image.
          CUDA_VERSION=$(docker run --rm --entrypoint "" llama-server-cuda \
              bash -c "ls /usr/local | grep '^cuda-[0-9]\+\.[0-9]\+$' | sort -V | tail -n1 | sed 's/cuda-//'")

          # The copied libraries are named by major version only.
          CUDA_MAJOR_VERSION=${CUDA_VERSION%%.*}

          # -L follows symlinks so the real library files are copied.
          docker cp -L "llama-server:/usr/local/cuda-$CUDA_VERSION/targets/x86_64-linux/lib/libcudart.so.$CUDA_MAJOR_VERSION" app/.
          docker cp -L "llama-server:/usr/local/cuda-$CUDA_VERSION/targets/x86_64-linux/lib/libcublas.so.$CUDA_MAJOR_VERSION" app/.
          docker cp -L "llama-server:/usr/local/cuda-$CUDA_VERSION/targets/x86_64-linux/lib/libcublasLt.so.$CUDA_MAJOR_VERSION" app/.

      - name: Set $ORIGIN as RPATH for `llama-server` and shared libraries
        # Single quotes keep `$ORIGIN` literal for patchelf.
        run: patchelf --set-rpath '$ORIGIN' app/llama-server app/*.so*

      - name: Separate CUDA backend
        run: |
          mkdir cuda
          mv app/libcudart* app/libcublas* app/libggml-cuda.so cuda/.

      - name: Pack llama-server
        run: |
          cp llama.cpp/LICENSE app/.
          cd app
          zip -r "../llama-server-${BUILD}-linux-x64.zip" *

      - name: Pack CUDA backend
        run: |
          cp llama.cpp/LICENSE cuda/.
          cd cuda
          zip -r "../backend-cuda-${BUILD}-linux-x64.zip" *

      - name: Upload llama-server
        uses: actions/upload-artifact@v4
        with:
          path: llama-server-${{ needs.find.outputs.build }}-linux-x64.zip
          name: llama-server-linux-x64

      - name: Upload CUDA backend
        uses: actions/upload-artifact@v4
        with:
          path: backend-cuda-${{ needs.find.outputs.build }}-linux-x64.zip
          name: backend-cuda-linux-x64

  # Builds the ROCm/HIP flavour of `llama-server` with upstream's Dockerfile
  # and uploads only the HIP backend bundle (libggml-hip.so plus the
  # versioned shared libraries collected next to it).
  linux-hip:
    needs: find
    runs-on: ubuntu-latest
    env:
      # Scripts read the upstream tag from the environment instead of having
      # it expanded into the script text, so it is never parsed as shell code.
      BUILD: ${{ needs.find.outputs.build }}
    steps:
      - name: Free Disk Space (Ubuntu)
        uses: ggml-org/free-disk-space@v1.3.1
        with:
          tool-cache: false
          android: true
          dotnet: true
          haskell: true
          large-packages: true
          docker-images: true
          swap-storage: true

      - name: Clone llama.cpp@${{ needs.find.outputs.build }}
        # Clone from the same repository the tag was resolved from in `find`.
        run: git clone https://github.com/ggml-org/llama.cpp --branch="$BUILD" --depth=1

      - name: Build `llama-server`
        run: docker build -f llama.cpp/.devops/rocm.Dockerfile --target server --tag llama-server llama.cpp

      - name: Copy HIP shared libraries
        run: |
          # Inside the container: copy every resolved dependency of
          # libggml-hip.so whose path contains "rocm" into /app. The container
          # is kept (no --rm) so the following steps can `docker cp` from it.
          docker run --name hip --entrypoint "" llama-server \
            bash -c "ldd /app/libggml-hip.so | awk '/=>/ {print \$3}' | grep -E 'rocm' | xargs -I{} cp -v {} /app/."

      - name: Extract `llama-server` from container
        run: docker cp hip:/app .

      - name: Copy libnuma.so library
        # -L follows the symlink so the real library file is copied.
        run: docker cp -L hip:/usr/lib/x86_64-linux-gnu/libnuma.so.1 app/.

      - name: Delete container and image
        run: |
          docker rm hip
          docker rmi llama-server

      - name: Set $ORIGIN as RPATH for `llama-server` and shared libraries
        # Single quotes keep `$ORIGIN` literal for patchelf.
        run: patchelf --set-rpath '$ORIGIN' app/llama-server app/*.so*

      - name: Pack HIP backend
        run: |
          mkdir hip
          mv app/libggml-hip.so app/*.so.* llama.cpp/LICENSE hip/.
          cd hip
          zip -r "../backend-hip-${BUILD}-linux-x64.zip" *

      - name: Upload HIP backend
        uses: actions/upload-artifact@v4
        with:
          path: backend-hip-${{ needs.find.outputs.build }}-linux-x64.zip
          name: backend-hip-linux-x64

  # Repackages the prebuilt upstream macOS release (one archive per
  # architecture) into a zip holding `llama-server`, its dylibs and the
  # license files.
  macos:
    needs: find
    runs-on: ubuntu-latest
    strategy:
      matrix:
        arch: [arm64, x64]
    env:
      # Scripts read the upstream tag from the environment instead of having
      # it expanded into the script text, so it is never parsed as shell code.
      BUILD: ${{ needs.find.outputs.build }}
    steps:
      - name: Download macOS release
        # --fail aborts on HTTP errors instead of saving the error page as
        # the archive.
        run: |
          curl --fail -LO "https://github.com/ggml-org/llama.cpp/releases/download/${BUILD}/llama-${BUILD}-bin-macos-${{ matrix.arch }}.tar.gz"

      - name: Extract macOS release
        run: tar -xzf "llama-${BUILD}-bin-macos-${{ matrix.arch }}.tar.gz"

      - name: Pack llama-server
        run: |
          mkdir app
          mv "llama-${BUILD}/llama-server" "llama-${BUILD}"/*.dylib "llama-${BUILD}"/LICENSE* app/.
          cd app
          zip -r "../llama-server-${BUILD}-macos-${{ matrix.arch }}.zip" *

      - name: Upload llama-server
        uses: actions/upload-artifact@v4
        with:
          path: llama-server-${{ needs.find.outputs.build }}-macos-${{ matrix.arch }}.zip
          name: llama-server-macos-${{ matrix.arch }}

  # Repackages the prebuilt upstream Windows CPU release into a zip holding
  # `llama-server.exe` and the DLLs shipped next to it.
  windows:
    needs: find
    runs-on: ubuntu-latest
    env:
      # Scripts read the upstream tag from the environment instead of having
      # it expanded into the script text, so it is never parsed as shell code.
      BUILD: ${{ needs.find.outputs.build }}
    steps:
      - name: Download Windows release
        # --fail aborts on HTTP errors instead of saving the error page as
        # the archive.
        run: |
          curl --fail -LO "https://github.com/ggml-org/llama.cpp/releases/download/${BUILD}/llama-${BUILD}-bin-win-cpu-x64.zip"

      - name: Unzip Windows release
        run: unzip "llama-${BUILD}-bin-win-cpu-x64.zip"

      - name: Pack llama-server
        run: |
          mkdir app
          mv llama-server.exe *.dll app/.
          cd app
          zip -r "../llama-server-${BUILD}-windows-x64.zip" *

      - name: Upload llama-server
        uses: actions/upload-artifact@v4
        with:
          path: llama-server-${{ needs.find.outputs.build }}-windows-x64.zip
          name: llama-server-windows-x64

  # Repackages the prebuilt upstream Windows CUDA release into a backend
  # bundle holding ggml-cuda.dll and the CUDA runtime DLLs.
  windows-cuda:
    needs: find
    runs-on: ubuntu-latest
    env:
      # Scripts read the upstream tag from the environment instead of having
      # it expanded into the script text, so it is never parsed as shell code.
      BUILD: ${{ needs.find.outputs.build }}
    steps:
      - name: Download Windows CUDA release
        run: |
          # The default shell only enables `-e`; turn on pipefail so a failed
          # curl is not masked by jq succeeding on empty input.
          set -euo pipefail

          # Read the CUDA version from the assets of the exact release chosen
          # by `find` (not from "latest", which may have moved on since), by
          # taking the first asset name matching cudart-*-cuda-<version>-.
          version=$(curl --fail --silent --show-error --location \
            "https://api.github.com/repos/ggml-org/llama.cpp/releases/tags/${BUILD}" | \
            jq -r '.assets | map(.name | capture("cudart-.*-cuda-(?<version>[^-]+)-")) | .[0] | .version')

          # jq prints "null" when no asset matched; stop before building
          # download URLs from it.
          if [ -z "$version" ] || [ "$version" = "null" ]; then
            echo "::error::No CUDA runtime asset found in release ${BUILD}"
            exit 1
          fi

          # --fail aborts on HTTP errors instead of saving the error page as
          # the archive.
          curl --fail -L -o llama-server-windows-x64.zip \
            "https://github.com/ggml-org/llama.cpp/releases/download/${BUILD}/llama-${BUILD}-bin-win-cuda-${version}-x64.zip"

          curl --fail -L -o cudart-windows-x64.zip \
            "https://github.com/ggml-org/llama.cpp/releases/download/${BUILD}/cudart-llama-bin-win-cuda-${version}-x64.zip"

      - name: Unzip Windows CUDA release
        run: |
          unzip llama-server-windows-x64.zip
          unzip cudart-windows-x64.zip

      - name: Pack Windows CUDA backend
        run: |
          mkdir cuda
          mv ggml-cuda.dll cudart*.dll cublas*.dll cuda/.
          cd cuda
          zip -r "../backend-cuda-${BUILD}-windows-x64.zip" *

      - name: Upload CUDA backend
        uses: actions/upload-artifact@v4
        with:
          path: backend-cuda-${{ needs.find.outputs.build }}-windows-x64.zip
          name: backend-cuda-windows-x64

  # Repackages the prebuilt upstream Windows HIP (Radeon) release into a
  # backend bundle holding ggml-hip.dll and the hipBLAS/rocBLAS files.
  windows-hip:
    needs: find
    runs-on: ubuntu-latest
    env:
      # Scripts read the upstream tag from the environment instead of having
      # it expanded into the script text, so it is never parsed as shell code.
      BUILD: ${{ needs.find.outputs.build }}
    steps:
      - name: Download Windows HIP release
        # --fail aborts on HTTP errors instead of saving the error page as
        # the archive.
        run: |
          curl --fail -LO "https://github.com/ggml-org/llama.cpp/releases/download/${BUILD}/llama-${BUILD}-bin-win-hip-radeon-x64.zip"

      - name: Unzip Windows HIP release
        run: unzip "llama-${BUILD}-bin-win-hip-radeon-x64.zip"

      - name: Pack Windows HIP backend
        run: |
          mkdir hip
          mv ggml-hip.dll hipblas*.dll rocblas.dll hipblaslt rocblas hip/.
          cd hip
          zip -r "../backend-hip-${BUILD}-windows-x64.zip" *

      - name: Upload HIP backend
        uses: actions/upload-artifact@v4
        with:
          path: backend-hip-${{ needs.find.outputs.build }}-windows-x64.zip
          name: backend-hip-windows-x64

  # Publishes a release named after the upstream tag and attaches every zip
  # produced by the build jobs. Does nothing when a release for that tag
  # already exists.
  release:
    needs: [find, linux-cuda, linux-hip, macos, windows, windows-cuda, windows-hip]
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Download artifacts
        uses: actions/download-artifact@v4
        with:
          path: ./release
          # Flatten all artifacts into ./release instead of one directory each.
          merge-multiple: true

      - name: Create release
        uses: actions/github-script@v3
        env:
          # Read by the script via process.env so the upstream tag is never
          # spliced into the JavaScript source.
          BUILD: ${{ needs.find.outputs.build }}
        with:
          github-token: ${{secrets.GITHUB_TOKEN}}
          script: |
            const path = require('path');
            const fs = require('fs');
            const tag = process.env.BUILD;

            // Probe for an existing release; only a 404 means "not released
            // yet". Any other failure is a real error and must abort the job.
            try {
              await github.repos.getReleaseByTag({
                owner: context.repo.owner,
                repo: context.repo.repo,
                tag,
              });
              console.log(`Release for build ${tag} already exists. Skipping...`);
              return;
            } catch (error) {
              // `status` is the numeric HTTP status code; comparing it to the
              // string '404' never matched, so every run rethrew here.
              if (error.status !== 404) throw error;
            }

            const release = await github.repos.createRelease({
              owner: context.repo.owner,
              repo: context.repo.repo,
              tag_name: tag,
            });

            // Attach every zip that the build jobs uploaded as artifacts.
            for (const file of fs.readdirSync('./release')) {
              if (path.extname(file) === '.zip') {
                console.log('uploadReleaseAsset', file);

                await github.repos.uploadReleaseAsset({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  release_id: release.data.id,
                  name: file,
                  data: fs.readFileSync(`./release/${file}`)
                });
              }
            }