#!/usr/bin/env bash

set -euo pipefail

# --- What to fetch -----------------------------------------------------------
# Name of the uncompressed FASTA on the remote server, and the URL it lives at.
REFERENCE_BASENAME=GRCh38_full_analysis_set_plus_decoy_hla.fa
REFERENCE_URL=https://ftp-trace.ncbi.nih.gov/1000genomes/ftp/technical/reference/GRCh38_reference_genome/${REFERENCE_BASENAME}

# --- Where to put it ---------------------------------------------------------
# Everything is stored under data/reference next to this script (the path is
# derived from $0, so it is relative when the script is invoked relatively).
SCRIPT_DIR=$(dirname "$0")
DEST_DIR=${SCRIPT_DIR}/data/reference

# Final artefacts: the compressed FASTA (.gz) and its .fai index.
REFERENCE_FA=${DEST_DIR}/${REFERENCE_BASENAME}.gz
REFERENCE_FAI=${REFERENCE_FA}.fai

# Create the destination directory (and any missing parents) up front.
mkdir -p "${DEST_DIR}"

# Nothing to do when an earlier run already left both the compressed FASTA
# and its .fai index on disk: report it and finish successfully.
if [[ -f "$REFERENCE_FA" ]] && [[ -f "$REFERENCE_FAI" ]]; then
    echo "Reference already present: $REFERENCE_FA"
    exit 0
fi

# Probe for the external tools used later in the script. If either one is
# missing from PATH, flag that an install is needed.
need_install=0
for required_tool in bgzip samtools; do
    if ! command -v "$required_tool" >/dev/null 2>&1; then
        need_install=1
    fi
done

# Install through apt-get when something is missing (bgzip is obtained via the
# "tabix" package here). Without apt-get there is no automatic path, so bail
# out and ask the user to install the tools by hand.
if (( need_install == 1 )); then
    echo "Installing required tools (tabix, samtools)..."
    if ! command -v apt-get >/dev/null 2>&1; then
        echo "apt-get not found. Install tabix and samtools manually." >&2
        exit 1
    fi
    apt-get update
    apt-get install -y tabix samtools
fi

# Fetch the uncompressed reference FASTA into the destination directory.
#   --fail      treat HTTP errors (4xx/5xx) as a failure with a non-zero exit
#               status instead of saving the server's error page as the FASTA
#               and carrying on to compress/index it.
#   --location  follow redirects.
echo "Downloading reference fasta..."
curl --fail --location --output "$DEST_DIR/$REFERENCE_BASENAME" "$REFERENCE_URL"

# Compress the downloaded FASTA in place with bgzip; this yields $REFERENCE_FA
# (the download path with ".gz" appended).
echo "Compressing with bgzip..."

# Number of compression threads: ask nproc for the CPUs available to this
# process, fall back to getconf, and finally to a single thread. This avoids
# parsing `ls` output over /sys, which counts offline CPUs and produces 0 on
# systems without that directory.
bgzip_threads="$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)"

# Options go before the file operand so the call does not rely on GNU-style
# argument permutation. --force overwrites a stale .gz left behind by an
# earlier interrupted run instead of prompting or refusing; this step is only
# reached when the .gz/.fai pair is incomplete, so any existing .gz is suspect.
bgzip --force --threads "$bgzip_threads" "$DEST_DIR/$REFERENCE_BASENAME"

# Build the indexes for the compressed FASTA: bgzip -r (re)creates the bgzip
# block index, samtools faidx creates the FASTA index.
echo "Indexing..."
bgzip -r "$REFERENCE_FA"
samtools faidx "$REFERENCE_FA"

echo "Reference ready: $REFERENCE_FA"
