---
# Container resource that launches a vLLM OpenAI-compatible API server
# for the model named in the MODEL environment variable.
kind: Container
metadata:
  name: vllm-qwen
  namespace: test
  # NOTE(review): `labels` (with `type`) is nested under `metadata` and
  # `platform` is kept top-level; confirm against the consumer's schema.
  labels:
    type: inference
platform: runpod
# Quoted so the "count:type" value is always read as a string.
accelerators:
  - "1:L4"
# NOTE(review): `latest` is a floating tag; consider pinning a version so
# restarts do not silently pick up a different image.
image: "vllm/vllm-openai:latest"
# Literal block scalar: newlines are preserved and each trailing backslash
# is a shell line continuation, so this runs as a single command.
# NOTE(review): the leading `env` prints every environment variable,
# including the HF_TOKEN secret, to the container output; drop it once
# debugging is done.
command: |
  env && python3 -m vllm.entrypoints.openai.api_server \
    --model $MODEL \
    --port 8000 \
    --host 0.0.0.0 \
    --gpu-memory-utilization 0.8 \
    --max-model-len 2048 \
    --max-num-seqs 1
env:
  # Model identifier expanded as $MODEL in the command above.
  - key: MODEL
    value: Qwen/Qwen2.5-0.5B-Instruct
  # Taken from a named secret rather than an inline value.
  - key: HF_TOKEN
    secret_name: HF_TOKEN
restart: Always
# Same port the server is told to listen on via `--port` in the command.
proxy_port: 8000