# One-shot training Pod.
#   1. The `init-swift` init container runs `orign prepare` against an exported
#      dataset URL, using /data/swift as its base path.
#   2. The `swift` container then runs `swift sft` on 4 GPUs, reading
#      /data/swift/train.jsonl and /data/swift/val.jsonl from the same volume.
apiVersion: v1
kind: Pod
metadata:
  name: swift
  namespace: orign
spec:
  # Containers are never restarted: a failed prepare or training step leaves
  # the Pod in a failed state instead of retrying.
  restartPolicy: Never
  initContainers:
    - name: init-swift
      image: us-docker.pkg.dev/agentsea-dev/orign/server:latest
      command: ["orign"]
      # Presumably writes the train/val split consumed by the main container
      # under --base-path; confirm against the `orign prepare` implementation.
      args:
        - prepare
        - --dataset-type
        - swift
        - --url
        - https://storage.googleapis.com/agentsea-dev-hub-exports/exports/2024-12-04/c42ee4be-5266-43c2-8321-0600d5eea4b5
        - --split-ratio
        # Quoted so the argument stays a string rather than a YAML float.
        - "0.8"
        - --base-path
        - /data/swift
      volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: huggingface-cache
        - mountPath: /data/swift
          name: swift-cache
  containers:
    - name: swift
      image: modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.4.0-1.20.0-LLM
      imagePullPolicy: Always
      command: ["/bin/bash", "-c"]
      # The script is a single `&&` chain, so any failing step stops the rest:
      #   - install wandb,
      #   - run `swift sft` across GPUs 0-3 (NPROC_PER_NODE=4),
      #   - list the working directory,
      #   - loop forever so the container stays up after training.
      # NOTE(review): because of the final loop the Pod never completes on
      # success and keeps its 4 GPUs allocated until it is deleted.
      # NOTE(review): USE_HF=1 is set both inline here and in `env` below;
      # one of the two is redundant.
      # Do not add comment lines inside the block scalar: they would become
      # part of the shell script and break the backslash continuations.
      args:
        - |
          pip install wandb && \
          CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 USE_HF=1 swift sft \
            --model_type qwen2-vl-7b-instruct \
            --model_id_or_path qwen/Qwen2-VL-7B-Instruct \
            --sft_type full \
            --max_length 8192 \
            --dataset /data/swift/train.jsonl \
            --val_dataset /data/swift/val.jsonl \
            --eval_strategy epoch \
            --deepspeed default-zero2 \
            --dtype bf16 \
            --num_train_epochs 1 \
            --report_to wandb \
          && \
          echo "Training done, listing files: " && ls && \
          while true; do echo "waiting..."; sleep 2; done
      env:
        - name: WANDB_PROJECT
          value: "orign"
        - name: WANDB_LOG_MODEL
          value: "checkpoint"
        # NOTE(review): placeholder credentials. Do not commit real values
        # here; prefer `valueFrom.secretKeyRef` pointing at a Secret.
        - name: WANDB_API_KEY
          value: "replace-me"
        - name: HF_TOKEN
          value: "replace-me"
        - name: USE_HF
          value: "1"
      resources:
        # Requests equal limits for both memory and GPUs.
        limits:
          memory: 100Gi
          nvidia.com/gpu: "4"
        requests:
          memory: 100Gi
          nvidia.com/gpu: "4"
      volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: huggingface-cache
        - mountPath: /data/swift
          name: swift-cache
        # Memory-backed volume mounted over /dev/shm (see `dshm` below).
        - mountPath: /dev/shm
          name: dshm
  # Schedule only onto nodes carrying both of these labels.
  nodeSelector:
    gpu-type: l40s-12xlarge
    role: gpu
  # Allow scheduling onto nodes tainted `gpu=true:NoSchedule` and onto nodes
  # with any `nvidia.com/gpu` NoSchedule taint.
  tolerations:
    - effect: NoSchedule
      key: gpu
      operator: Equal
      value: "true"
    - effect: NoSchedule
      key: nvidia.com/gpu
      operator: Exists
  volumes:
    # NOTE(review): `huggingface-cache-pvc` is not defined in the part of the
    # file visible here; it must already exist in the `orign` namespace.
    - name: huggingface-cache
      persistentVolumeClaim:
        claimName: huggingface-cache-pvc
    - name: swift-cache
      persistentVolumeClaim:
        claimName: swift-pvc
    # RAM-backed emptyDir with no sizeLimit set.
    - name: dshm
      emptyDir:
        medium: Memory
---
# Storage claim referenced by the `swift` Pod's `swift-cache` volume
# (claimName: swift-pvc), which both of its containers mount at /data/swift.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: swift-pvc
  namespace: orign
spec:
  # NOTE(review): `gp2` must exist as a StorageClass in the target cluster;
  # confirm before applying elsewhere.
  storageClassName: gp2
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi