---
# Container spec that runs a TRL supervised fine-tuning job (`trl sft`)
# and copies whatever lands in /output to the S3 destination listed under
# `volumes`.
kind: Container
metadata:
  name: trl-capybara
  namespace: test
  labels:
    type: training

# NOTE(review): `:latest` is a floating tag, so reruns may pick up a
# different image; pin a specific tag or digest if reproducibility matters.
image: "huggingface/trl-latest-gpu:latest"

# Shell command executed in the container. $MODEL and $DATASET come from
# the `env` list below. NEBU_CONTAINER_ID is not defined in this file;
# presumably the platform injects it, giving each run its own output
# directory -- confirm.
# Expansions are double-quoted so values are never word-split or globbed.
# NOTE(review): `--save_steps 1` writes a checkpoint on every optimizer
# step (only the 3 newest are kept via `--save_total_limit`); looks
# intentional for exercising the volume sync, but verify before reuse.
command: |
  source activate trl && trl sft \
    --model_name_or_path "$MODEL" \
    --dataset_name "$DATASET" \
    --dataset_train_split "train" \
    --dataset_test_split "test" \
    --output_dir "/output/$NEBU_CONTAINER_ID" \
    --torch_dtype bfloat16 \
    --max_seq_length 2048 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 2 \
    --use_peft true \
    --save_strategy steps \
    --save_steps 1 \
    --save_total_limit 3 \
    --num_train_epochs 1

platform: runpod

env:
  # Base model handed to `--model_name_or_path`.
  - key: MODEL
    value: Qwen/Qwen2.5-0.5B-Instruct
  # Dataset handed to `--dataset_name`. The value matches the dataset the
  # command used to hard-code, so the job trains on the same data.
  # NOTE(review): this variable used to be set to trl-lib/Capybara but was
  # never referenced by the command -- confirm which dataset is intended.
  - key: DATASET
    value: agentsea/Capybara-slim

volumes:
  # Maps the container's /output directory to the S3 prefix below.
  # NOTE(review): `continuous: true` presumably keeps copying while the
  # job runs rather than once at exit -- confirm against the driver docs.
  - source: /output
    dest: s3://nebulous-rs/adapter-test-jobs/
    driver: RCLONE_COPY
    continuous: true

accelerators:
  # Presumably "<count>:<accelerator type>", i.e. one L4 GPU -- confirm.
  # Quoted so the colon-separated value is always read as a string.
  - "1:L4"

restart: Never