# Persistent cache volume for downloaded Hugging Face model files.
# Referenced by claimName from the vLLM Deployment in this manifest, which
# mounts it at /root/.cache/huggingface.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: huggingface-cache-pvc
spec:
  # ReadWriteOnce: the volume can be mounted read-write by a single node only.
  accessModes:
    - ReadWriteOnce
  # NOTE(review): presumably the cluster's AWS EBS "gp2" class; confirm it
  # exists in the target cluster.
  storageClassName: gp2
  resources:
    requests:
      storage: 100Gi
---
# Single-replica vLLM inference server running on a GPU node.
# `{{model-name}}` is a template placeholder substituted before this manifest
# is applied; it must expand to the same value everywhere it appears.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: orign-vllm-{{model-name}}
spec:
  replicas: 1
  # The cache PVC is ReadWriteOnce and the pod requests a GPU. The default
  # RollingUpdate strategy starts the replacement pod before the old one is
  # stopped, so a rollout can hang waiting for the volume (if scheduled on a
  # different node) or for a free GPU. Recreate stops the old pod first.
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app: orign-vllm-{{model-name}}
  template:
    metadata:
      labels:
        app: orign-vllm-{{model-name}}
    spec:
      # Shares the host's IPC namespace with the pod.
      # NOTE(review): presumably needed for shared memory between vLLM worker
      # processes; confirm, since many cluster security policies forbid it.
      hostIPC: true
      nodeSelector:
        role: gpu
        gpu-type: l40s-2xlarge
      containers:
        - name: orign-vllm-{{model-name}}
          # NOTE(review): ":latest" is a mutable tag; consider pinning a
          # version or digest so rollouts are reproducible.
          image: us-docker.pkg.dev/agentsea-dev/orign/vllm:latest
          # NOTE(review): the model ID is hard-coded while resource names are
          # templated with {{model-name}}; confirm this is intentional.
          args:
            - "--model"
            - "allenai/Molmo-7B-D-0924"
            - "--dtype"
            - "float32"
            - "--trust-remote-code"
          ports:
            - containerPort: 8000
          # Keep the pod out of Service endpoints until the server is
          # accepting TCP connections on its serving port.
          readinessProbe:
            tcpSocket:
              port: 8000
            periodSeconds: 10
          env:
            # Hugging Face access token, read from a cluster Secret rather
            # than being embedded in the manifest.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: huggingface-secret
                  key: HUGGING_FACE_HUB_TOKEN
          resources:
            limits:
              nvidia.com/gpu: 1
            requests:
              nvidia.com/gpu: 1
          volumeMounts:
            # Persist the Hugging Face cache directory across pod restarts.
            - name: huggingface-cache
              mountPath: /root/.cache/huggingface
      volumes:
        - name: huggingface-cache
          persistentVolumeClaim:
            claimName: huggingface-cache-pvc
---
# Exposes port 8000 of the vLLM pods through a LoadBalancer Service.
# NOTE(review): a LoadBalancer usually provisions an externally reachable
# endpoint; confirm that exposing the inference API this way is intended.
apiVersion: v1
kind: Service
metadata:
  name: orign-vllm-{{model-name}}
spec:
  type: LoadBalancer
  # Must equal the pod template label set by the Deployment
  # (app: orign-vllm-{{model-name}}). The previous value "orign-vllm" matched
  # no pods, leaving the Service without endpoints.
  selector:
    app: orign-vllm-{{model-name}}
  ports:
    - protocol: TCP
      port: 8000
      targetPort: 8000