llmux 0.2.0

Zero-reload model switching for vLLM - manages multiple models on a shared GPU
Example configuration:
{
  "models": {
    "llama-7b": {
      "model_path": "meta-llama/Llama-2-7b-chat-hf",
      "port": 8001,
      "gpu_memory_utilization": 0.9,
      "tensor_parallel_size": 1,
      "dtype": "auto",
      "sleep_level": 1
    },
    "mistral-7b": {
      "model_path": "mistralai/Mistral-7B-Instruct-v0.2",
      "port": 8002,
      "gpu_memory_utilization": 0.9,
      "tensor_parallel_size": 1,
      "dtype": "auto",
      "sleep_level": 1
    }
  },
  "policy": {
    "policy_type": "fifo",
    "request_timeout_secs": 300,
    "drain_before_switch": true,
    "sleep_level": 1
  },
  "port": 3000,
  "metrics_port": 9090
}
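
As a rough usage sketch: assuming llmux exposes an OpenAI-compatible proxy on the top-level port (3000 above) and routes each request to the matching backend by its "model" field, a client call could look like the snippet below. The /v1/chat/completions path and the routing-by-name behavior are assumptions for illustration, not taken from llmux's own documentation.

# Hypothetical client call through the llmux proxy (port 3000 above).
# The OpenAI-compatible route and model-name routing are assumptions.
import requests

resp = requests.post(
    "http://localhost:3000/v1/chat/completions",
    json={
        "model": "llama-7b",  # a key from the "models" section of the config
        "messages": [{"role": "user", "content": "Hello"}],
    },
    timeout=300,  # matches request_timeout_secs in the policy above
)
print(resp.json())

Reading the policy fields, a subsequent request for "mistral-7b" would presumably be queued (fifo), in-flight llama-7b requests drained first (drain_before_switch), and the idle model put to sleep at sleep_level 1 before the other is woken; this interpretation is inferred from the field names rather than confirmed behavior.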