# nebulous 0.1.86
#
# A globally distributed container orchestrator.
# Documentation
# Container manifest for a one-epoch `trl sft` fine-tuning job that runs in
# the TRL GPU image on the `runpod` platform.
kind: Container
metadata:
  name: trl-capybara
  namespace: test
  labels:
    type: training
image: "huggingface/trl-latest-gpu:latest"
# Shell command executed in the container. The model and dataset are taken
# from the MODEL and DATASET entries under `env`, so editing those values is
# enough to retarget the job.
# Checkpoints are written under /output, which is the `source` of the volume
# declared below. `--save_steps 1` checkpoints after every step and
# `--save_total_limit 3` caps how many checkpoints are kept.
# NOTE(review): NEBU_CONTAINER_ID is not defined in this file; presumably the
# platform injects it at runtime -- confirm.
command: |
  source activate trl && trl sft \
    --model_name_or_path "$MODEL" \
    --dataset_name "$DATASET" \
    --dataset_train_split "train" \
    --dataset_test_split "test" \
    --output_dir "/output/$NEBU_CONTAINER_ID" \
    --torch_dtype bfloat16 \
    --max_seq_length 2048 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --use_peft true \
    --save_strategy steps \
    --save_steps 1 \
    --save_total_limit 3 \
    --num_train_epochs 1
platform: runpod
# Environment variables referenced by `command`.
env:
  - key: MODEL
    value: Qwen/Qwen2.5-0.5B-Instruct
  # Dataset passed to --dataset_name.
  - key: DATASET
    value: agentsea/Capybara-slim
# NOTE(review): presumably copies the contents of /output to the S3 prefix
# with rclone for as long as the container runs (`continuous: true`) --
# confirm against the volume driver documentation.
volumes:
  - source: /output
    dest: s3://nebulous-rs/adapter-test-jobs/
    driver: RCLONE_COPY
    continuous: true
# NOTE(review): looks like "<count>:<accelerator type>", i.e. one L4 GPU --
# confirm.
accelerators:
  - "1:L4"
restart: Never