# Converge Analytics - Dataset download helpers
# Run `just --list` to see all available recipes
# Default recipe: show help
default:
    @just --list
# =============================================================================
# Doctor - Environment Health Check
# =============================================================================
# Check component health
doctor:
    #!/usr/bin/env bash
    echo " Checking analytics environment..."
    if [ -d "data" ]; then
        count=$(ls -1 data/*.parquet 2>/dev/null | wc -l | tr -d ' ')
        echo " ✓ data/ present with $count parquet files"
    else
        echo " ⚠ data/ missing → run 'just download-train'"
    fi
    if command -v curl >/dev/null 2>&1; then
        echo " ✓ curl available"
    else
        echo " ✗ curl not found"
    fi
# =============================================================================
# Dataset Download Helpers
# =============================================================================
# Default dataset points at California Housing parquet split.
DATASET_URL := "https://huggingface.co/datasets/gvlassis/california_housing/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"
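# The URL can be changed per invocation, either by passing it as a recipe
# argument (see the recipes below) or via just's variable-override syntax,
# e.g. `just DATASET_URL=<url> download-train`.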
# Download the training dataset into data/ for the training flow.
download-train url=DATASET_URL:
    mkdir -p data
    size=$(curl -sIL "{{url}}" | awk -F': ' 'tolower($1)=="content-length"{print $2}' | tr -d '\r' | tail -n 1); \
    if [ -n "$size" ]; then echo "Download size: $size bytes"; else echo "Download size: unknown"; fi
    curl -fL "{{url}}" -o data/california_housing_train.parquet
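
# Usage: `just download-train` fetches the default split; pass a different
# parquet URL positionally, e.g. `just download-train https://example.com/data.parquet`
# (a hypothetical URL; any directly downloadable parquet link should work).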
# Download the inference dataset into the repo root for the agent_loop example.
download-infer url=DATASET_URL:
    size=$(curl -sIL "{{url}}" | awk -F': ' 'tolower($1)=="content-length"{print $2}' | tr -d '\r' | tail -n 1); \
    if [ -n "$size" ]; then echo "Download size: $size bytes"; else echo "Download size: unknown"; fi
    curl -fL "{{url}}" -o california_housing_train.parquet
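
# Note: unlike download-train, this writes california_housing_train.parquet
# next to the Justfile, which is where the agent_loop example looks for it.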
# List supported dataset shortcuts.
list-datasets:
    @echo "airbnb (kraina/airbnb)"
    @echo "california-housing (gvlassis/california_housing)"
    @echo "finance-tweet (megagonlabs/finance-tweet)"
    @echo "kaggle-airbnb (kaggle/airbnb)"
    @echo "mteb-reddit (mteb/reddit)"
    @echo "mteb-twitter (mteb/twitter)"
    @echo "nyc-taxi-trip-duration (nyu-mll/nyc-taxi-trip-duration)"
    @echo "openflights (OpenFlights)"
    @echo "procurement (theRACER/Procurement)"
    @echo "supply-chain (mhimchak/supply_chain_data)"
    @echo "tweet-eval (cardiffnlp/tweet_eval)"
    @echo "uber-movement (UberMovement/uber-movement)"
    @echo "walmart-trips (shuyan/walmart-trips)"
# Convenience loaders for datasets listed in datasets/.
load-airbnb:
    dataset="kraina/airbnb"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-california-housing:
    dataset="gvlassis/california_housing"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-finance-tweet:
    dataset="megagonlabs/finance-tweet"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-kaggle-airbnb:
    dataset="kaggle/airbnb"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-mteb-reddit:
    dataset="mteb/reddit"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-mteb-twitter:
    dataset="mteb/twitter"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-nyc-taxi-trip-duration:
    dataset="nyu-mll/nyc-taxi-trip-duration"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-openflights:
    dataset="OpenFlights"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-procurement:
    dataset="theRACER/Procurement"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-supply-chain:
    dataset="mhimchak/supply_chain_data"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-tweet-eval:
    dataset="cardiffnlp/tweet_eval"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-uber-movement:
    dataset="UberMovement/uber-movement"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet

load-walmart-trips:
    dataset="shuyan/walmart-trips"; \
    url="https://huggingface.co/datasets/$dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"; \
    just download-train "$url"; \
    cp data/california_housing_train.parquet california_housing_train.parquet
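
# The loaders above all follow one pattern, so a single parameterized recipe
# could stand in for them. A sketch (assumes every dataset exposes a
# default/train/0000.parquet file under Hugging Face's parquet-conversion ref,
# which may not hold for every entry above):
load dataset:
    just download-train "https://huggingface.co/datasets/{{dataset}}/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"
    cp data/california_housing_train.parquet california_housing_train.parquet
# Example: `just load kraina/airbnb` mirrors `just load-airbnb`.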