# orign 0.2.3
#
# A globally distributed container orchestrator
# Documentation
---
# Fine-tuning job packaged as a single Pod: an init container runs
# `orign prepare` against /data/swift, then the main container runs
# `swift sft` on 4 GPUs using the files under that same path.
apiVersion: v1
kind: Pod
metadata:
  name: swift
  namespace: orign
spec:
  # Containers are not restarted once they exit.
  restartPolicy: Never
  initContainers:
    # Runs to completion before the training container starts. It is pointed
    # at /data/swift via --base-path; the training command below reads
    # train.jsonl and val.jsonl from that directory (presumably produced by
    # this step with the 0.8 split ratio — confirm against `orign prepare`).
    - name: init-swift
      image: us-docker.pkg.dev/agentsea-dev/orign/server:latest
      command: ["orign"]
      args:
        - prepare
        - --dataset-type
        - swift
        - --url
        - https://storage.googleapis.com/agentsea-dev-hub-exports/exports/2024-12-04/c42ee4be-5266-43c2-8321-0600d5eea4b5
        - --split-ratio
        - "0.8"
        - --base-path
        - /data/swift
      volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: huggingface-cache
        - mountPath: /data/swift
          name: swift-cache
  containers:
    - name: swift
      image: modelscope-registry.us-west-1.cr.aliyuncs.com/modelscope-repo/modelscope:ubuntu22.04-cuda12.4.0-py310-torch2.4.0-1.20.0-LLM
      imagePullPolicy: Always
      command: ["/bin/bash", "-c"]
      # The script is one `&&` chain, so any failing step stops the chain and
      # the container exits with that step's status:
      #   1. install wandb (used by `--report_to wandb`),
      #   2. run `swift sft` across the 4 requested GPUs,
      #   3. list the working directory, then loop forever.
      # USE_HF is supplied once through `env` below rather than inline.
      # NOTE(review): the final `while true` loop means the Pod never
      # completes and keeps its 4 GPUs until it is deleted; presumably a
      # keep-alive so results can be inspected — confirm, and delete the Pod
      # when done.
      args:
        - |
          pip install wandb && \
          CUDA_VISIBLE_DEVICES=0,1,2,3 NPROC_PER_NODE=4 swift sft \
          --model_type qwen2-vl-7b-instruct \
          --model_id_or_path qwen/Qwen2-VL-7B-Instruct \
          --sft_type full \
          --max_length 8192 \
          --dataset /data/swift/train.jsonl \
          --val_dataset /data/swift/val.jsonl \
          --eval_strategy epoch \
          --deepspeed default-zero2 \
          --dtype bf16 \
          --num_train_epochs 1 \
          --report_to wandb \
          && \
          echo "Training done, listing files: " && ls && \
          while true; do echo "waiting..."; sleep 2; done
      env:
        - name: WANDB_PROJECT
          value: "orign"
        - name: WANDB_LOG_MODEL
          value: "checkpoint"
        # NOTE(review): the two credentials below are placeholders. Do not
        # commit real values here; prefer `valueFrom.secretKeyRef` pointing at
        # a Kubernetes Secret.
        - name: WANDB_API_KEY
          value: "replace-me"
        - name: HF_TOKEN
          value: "replace-me"
        - name: USE_HF
          value: "1"
      resources:
        # Requests equal limits for both memory and GPUs.
        limits:
          memory: 100Gi
          nvidia.com/gpu: "4"
        requests:
          memory: 100Gi
          nvidia.com/gpu: "4"
      volumeMounts:
        - mountPath: /root/.cache/huggingface
          name: huggingface-cache
        - mountPath: /data/swift
          name: swift-cache
        # Memory-backed emptyDir mounted over /dev/shm (see `dshm` volume).
        - mountPath: /dev/shm
          name: dshm
  # Scheduling: only nodes carrying both labels are eligible, and the Pod
  # tolerates the two NoSchedule taints listed below.
  nodeSelector:
    gpu-type: l40s-12xlarge
    role: gpu
  tolerations:
    - effect: NoSchedule
      key: gpu
      operator: Equal
      value: "true"
    - effect: NoSchedule
      key: nvidia.com/gpu
      operator: Exists
  volumes:
    # NOTE(review): huggingface-cache-pvc is not defined in this file; it
    # must already exist in the `orign` namespace.
    - name: huggingface-cache
      persistentVolumeClaim:
        claimName: huggingface-cache-pvc
    - name: swift-cache
      persistentVolumeClaim:
        claimName: swift-pvc
    - name: dshm
      emptyDir:
        medium: Memory

---
# Claim referenced by the `swift` Pod's `swift-cache` volume
# (claimName: swift-pvc), which that Pod mounts at /data/swift.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: swift-pvc
  namespace: orign
spec:
  # NOTE(review): a StorageClass named "gp2" must exist in the target
  # cluster — confirm before applying elsewhere.
  storageClassName: gp2
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: 100Gi