# Multi-stage build for llmux with vLLM
#
# This Dockerfile builds the llmux binary and packages it with vLLM,
# enabling zero-reload switching between multiple models on a shared GPU.
#
# Includes cuda-checkpoint and CRIU with its CUDA plugin, required for sleep levels 3 and 4.
# =============================================================================
# Stage 1: Build the Rust binary
# =============================================================================
FROM rust:1.90-bookworm AS builder
WORKDIR /build
# Install dependencies for building
RUN apt-get update && apt-get install -y \
    pkg-config \
    libssl-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy project files
COPY Cargo.toml Cargo.lock ./
COPY src ./src
COPY tests ./tests
# Build the llmux binary in release mode
RUN cargo build --release --bin llmux
# =============================================================================
# Stage 2: Build CRIU with CUDA plugin
# =============================================================================
FROM ubuntu:22.04 AS criu-builder
RUN apt-get update && apt-get install -y \
    build-essential \
    pkg-config \
    libprotobuf-dev \
    libprotobuf-c-dev \
    protobuf-c-compiler \
    protobuf-compiler \
    python3-protobuf \
    libnl-3-dev \
    libcap-dev \
    libaio-dev \
    libgnutls28-dev \
    libnet1-dev \
    uuid-dev \
    git \
    && rm -rf /var/lib/apt/lists/*
RUN git clone --depth 1 --branch v4.1 https://github.com/checkpoint-restore/criu.git /criu
WORKDIR /criu
RUN make -j$(nproc) && \
    make -C plugins/cuda -j$(nproc)
# =============================================================================
# Stage 3: Create the final image with vLLM
# =============================================================================
FROM vllm/vllm-openai:v0.15.1
# Apply patches:
# 1. Fix sleep mode regression (vllm#32714): `with A and B:` -> `with A, B:`
# 2. NCCL suspend/resume for cuda-checkpoint at TP>1
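#    (In Python, `with A and B:` evaluates `A and B` down to a single object and
#    enters only that one context manager; `with A, B:` enters both.)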
COPY patches/fix-sleep-mode-v0.15.1.patch /tmp/
COPY patches/nccl-suspend-resume-v0.15.1.patch /tmp/
RUN cd /usr/local/lib/python3.12/dist-packages && \
    patch -p1 < /tmp/fix-sleep-mode-v0.15.1.patch && \
    patch -p1 < /tmp/nccl-suspend-resume-v0.15.1.patch && \
    rm /tmp/*.patch
# Install cuda-checkpoint from NVIDIA's repo (pre-built binary)
ADD https://raw.githubusercontent.com/NVIDIA/cuda-checkpoint/main/bin/x86_64_Linux/cuda-checkpoint \
    /usr/local/bin/cuda-checkpoint
RUN chmod +x /usr/local/bin/cuda-checkpoint
# Install CRIU and CUDA plugin from builder stage
COPY --from=criu-builder /criu/criu/criu /usr/local/bin/criu
COPY --from=criu-builder /criu/plugins/cuda/cuda_plugin.so /usr/lib/criu/cuda_plugin.so
# CRIU runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    libnl-3-200 \
    libprotobuf-c1 \
    iptables \
    libnet1 \
    && rm -rf /var/lib/apt/lists/*
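# Note: CRIU can only be fully exercised at runtime; with sufficient privileges,
# `criu check` is a quick sanity check of kernel support, and the CUDA plugin
# relies on the cuda-checkpoint binary installed above.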
# Copy the llmux binary from builder
COPY --from=builder /build/target/release/llmux /usr/local/bin/llmux
# Create config and checkpoint directories
RUN mkdir -p /etc/llmux /tmp/llmux-checkpoints
# Default config location
ENV LLMUX_CONFIG=/etc/llmux/config.json
# Expose the proxy port (default 3000) and metrics port (default 9090)
EXPOSE 3000 9090
# llmux will spawn vLLM processes internally
# It expects a config file at $LLMUX_CONFIG
ENTRYPOINT ["/usr/local/bin/llmux"]
CMD ["--config", "/etc/llmux/config.json"]