llmux 0.7.3

Zero-reload model switching for vLLM - manages multiple models on a shared GPU
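
llmux is driven by a single JSON config. Each entry under `models` names a vLLM backend: its model path, the port the backend serves on, a per-model `eviction` block (whether an inactive model's weights are offloaded or discarded, and whether its process is kept running or checkpointed), and `extra_args` passed through to vLLM. The `checkpoint` block points at the CRIU and cuda-checkpoint binaries used for process checkpointing, and `policy` sets the queueing policy, request timeout, and whether in-flight requests are drained before a switch. Example configuration: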
```json
{
  "models": {
    "llama-7b": {
      "model_path": "meta-llama/Llama-2-7b-chat-hf",
      "port": 8001,
      "eviction": { "weights": "offload", "process": "keep_running" },
      "extra_args": ["--gpu-memory-utilization", "0.9"]
    },
    "mistral-7b": {
      "model_path": "mistralai/Mistral-7B-Instruct-v0.2",
      "port": 8002,
      "eviction": { "weights": "discard", "process": "checkpoint" },
      "extra_args": ["--gpu-memory-utilization", "0.9"]
    }
  },
  "checkpoint": {
    "criu_path": "criu",
    "cuda_plugin_dir": "/usr/lib/criu/",
    "images_dir": "/tmp/llmux-checkpoints",
    "cuda_checkpoint_path": "cuda-checkpoint",
    "keep_images": true
  },
  "policy": {
    "policy_type": "fifo",
    "request_timeout_secs": 300,
    "drain_before_switch": true,
    "eviction": { "weights": "offload", "process": "keep_running" }
  },
  "port": 3000,
  "metrics_port": 9090,
  "admin_port": 3001,
  "warmup": false
}
```
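
With this config, `llama-7b` and `mistral-7b` are both reached through the single router port (3000). Below is a minimal client sketch, assuming llmux forwards OpenAI-compatible requests (the format vLLM itself serves) and selects the backend from the `model` field; the exact endpoint path and routing behaviour are assumptions, not shown in this excerpt:

```python
# Hedged sketch: assumes llmux exposes an OpenAI-compatible proxy on the main
# port (3000 in the config above) and routes to the matching vLLM backend by
# the "model" field. The /v1/chat/completions path is what vLLM serves; whether
# llmux forwards it unchanged is an assumption.
import requests

resp = requests.post(
    "http://localhost:3000/v1/chat/completions",
    json={
        "model": "llama-7b",  # key from the "models" section above
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 64,
    },
    timeout=300,  # matches request_timeout_secs in the policy block
)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])
```

Changing `model` to `mistral-7b` would trigger a switch under the `fifo` policy. Reading the `eviction` blocks above: an evicted `llama-7b` keeps its process alive with weights offloaded, while an evicted `mistral-7b` discards its weights and has its process checkpointed with CRIU into `images_dir`.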