name: sample_data_processing_workflow
description: A sample workflow that demonstrates data processing with multiple jobs
jobs:
- name: data_download
command: wget https://example.com/data.csv -O input_data.csv
invocation_script: null
cancel_on_blocking_job_failure: false
resource_requirements: small_job
depends_on: null
input_files: null
output_files:
- raw_data
input_user_data: null
output_user_data:
- download_metadata
scheduler: default_scheduler
- name: data_validation
command: python validate_data.py input_data.csv
invocation_script: |
#!/bin/bash
set -e
export PYTHONPATH=/opt/validation:$PYTHONPATH
cancel_on_blocking_job_failure: true
resource_requirements: small_job
depends_on:
- data_download
input_files:
- raw_data
- validation_script
output_files:
- validated_data
input_user_data:
- download_metadata
output_user_data:
- validation_results
scheduler: default_scheduler
- name: data_analysis
command: python analyze_data.py validated_data.csv --output results.json
invocation_script: null
cancel_on_blocking_job_failure: true
resource_requirements: large_job
depends_on:
- data_validation
input_files:
- validated_data
- analysis_script
output_files:
- analysis_results
input_user_data:
- validation_results
output_user_data:
- final_analysis
scheduler: gpu_scheduler
files:
- name: raw_data
path: /data/input/raw_data.csv
- name: validated_data
path: /data/processed/validated_data.csv
- name: analysis_results
path: /data/output/results.json
- name: validation_script
path: /scripts/validate_data.py
- name: analysis_script
path: /scripts/analyze_data.py
user_data:
- name: download_metadata
data:
source_url: https://example.com/data.csv
download_timestamp: "2024-01-15T10:30:00Z"
file_size_bytes: 1048576
is_ephemeral: true
- name: validation_results
data:
validation_rules:
- no_nulls
- valid_dates
- numeric_ranges
passed: true
row_count: 10000
is_ephemeral: false
- name: final_analysis
data:
analysis_type: statistical_summary
confidence_level: 0.95
is_ephemeral: false
resource_requirements:
- name: small_job
num_cpus: 1
num_gpus: 0
num_nodes: 1
memory: 2g
runtime: PT30M
- name: large_job
num_cpus: 4
num_gpus: 1
num_nodes: 1
memory: 16g
runtime: PT2H
slurm_schedulers:
- name: default_scheduler
account: project_account
gres: null
mem: 8G
nodes: 1
ntasks_per_node: 1
partition: general
qos: normal
tmp: 10G
walltime: "01:00:00"
extra: "--constraint=haswell"
- name: gpu_scheduler
account: gpu_project
gres: "gpu:1"
mem: 32G
nodes: 1
ntasks_per_node: 1
partition: gpu
qos: high
tmp: 50G
walltime: "04:00:00"
extra: "--constraint=v100"