llmux 0.7.3

Zero-reload model switching for vLLM: manages multiple models on a shared GPU.
Example configuration:
{
  "models": {
    "llama-8b": {
      "model_path": "NousResearch/Meta-Llama-3.1-8B-Instruct",
      "port": 8005,
      "eviction": { "weights": "retain", "process": "cuda_suspend" },
      "extra_args": [
        "--max-model-len", "512",
        "--gpu-memory-utilization", "0.85",
        "--enforce-eager",
        "--disable-custom-all-reduce",
        "--tensor-parallel-size", "2"
      ]
    }
  },
  "checkpoint": {
    "cuda_checkpoint_path": "cuda-checkpoint"
  },
  "vllm_command": "/home/fergus/.local/bin/vllm",
  "port": 3333,
  "metrics_port": 0
}
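
With this config, llmux listens on port 3333 and routes requests to the backing vLLM server for the selected model (here, llama-8b on port 8005). A minimal client sketch follows, assuming llmux forwards vLLM's OpenAI-compatible /v1/chat/completions endpoint on the configured port; the endpoint path and payload shape are assumptions drawn from the vLLM API, not from llmux's documentation.

# Minimal client sketch: send a chat completion through the llmux proxy.
# Assumes llmux forwards vLLM's OpenAI-compatible API on "port" (3333)
# and that the model name matches the config key "llama-8b".
import json
import urllib.request

payload = {
    "model": "llama-8b",  # key from the "models" section above
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "max_tokens": 64,
}

req = urllib.request.Request(
    "http://localhost:3333/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)

with urllib.request.urlopen(req) as resp:
    body = json.load(resp)
    print(body["choices"][0]["message"]["content"])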
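
The "eviction" block controls what happens when a model is switched out: "weights": "retain" presumably keeps the weights resident rather than reloading them from disk, and "process": "cuda_suspend" suspends the backing vLLM process's CUDA state using the utility named by "cuda_checkpoint_path". A rough sketch of that mechanism is below; it illustrates NVIDIA's cuda-checkpoint toggle interface and is not llmux's actual implementation.

# Rough sketch of cuda_suspend-style eviction/restoration using NVIDIA's
# cuda-checkpoint utility (the binary named by "cuda_checkpoint_path").
# Illustration of the mechanism only, not llmux's implementation.
import subprocess

CUDA_CHECKPOINT = "cuda-checkpoint"  # from the "checkpoint" config section

def toggle_cuda_state(pid: int) -> None:
    """Toggle a process between running and checkpointed CUDA state.

    The same command both suspends (GPU state is moved to host memory and
    the device is released) and resumes, depending on the current state.
    """
    subprocess.run([CUDA_CHECKPOINT, "--toggle", "--pid", str(pid)], check=True)

# Suspending the vLLM worker frees its GPU memory for another model;
# toggling again restores it without reloading weights from disk.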