#!/usr/bin/env bash
#
# Soak + failure-injection run for the `ir` binary.
#
# Every setting below can be overridden from the environment:
#   ARTIFACTS_DIR    output directory            (default: <root>/artifacts)
#   REPORT_PREFIX    file-name prefix            (default: soak_failure)
#   SLO_GATE_SCRIPT  gate script to invoke       (default: <root>/scripts/slo_gate.sh)
#   SOAK_CYCLES      number of soak cycles       (default: 10)
#   INGEST_COUNT     passed to the ingest loop   (default: 1200)
#   INGEST_ROUNDS    passed to the ingest loop   (default: 3)
#   INJECT_EVERY     inject on every Nth cycle   (default: 3)
#   MAX_ERROR_RATE   gate threshold              (default: 0.05)
#   MAX_P95_MICROS   gate threshold              (default: 200000)
#   MAX_P99_MICROS   gate threshold              (default: 300000)
set -euo pipefail

# The repository root is the parent of the directory holding this script;
# all relative paths below are resolved from there.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# Apply defaults only where the caller left a setting unset or empty.
: "${ARTIFACTS_DIR:=$ROOT_DIR/artifacts}"
: "${REPORT_PREFIX:=soak_failure}"
: "${SLO_GATE_SCRIPT:=$ROOT_DIR/scripts/slo_gate.sh}"
: "${SOAK_CYCLES:=10}"
: "${INGEST_COUNT:=1200}"
: "${INGEST_ROUNDS:=3}"
: "${INJECT_EVERY:=3}"
: "${MAX_ERROR_RATE:=0.05}"
: "${MAX_P95_MICROS:=200000}"
: "${MAX_P99_MICROS:=300000}"

# Per-run logs live in a sub-directory of the artifacts directory.
LOG_DIR="$ARTIFACTS_DIR/${REPORT_PREFIX}_logs"
mkdir -p "$ARTIFACTS_DIR" "$LOG_DIR"

# Output files, all sharing the same prefix.
artifact_base="$ARTIFACTS_DIR/$REPORT_PREFIX"
REPORT_JSON="${artifact_base}_report.json"
REPORT_MD="${artifact_base}_report.md"
METRICS_FILE="${artifact_base}_metrics.prom"
GATE_JSON="${artifact_base}_slo_gate.json"
GATE_MD="${artifact_base}_slo_gate.md"

# Run metadata recorded in the reports.
TIMESTAMP_UTC="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
COMMIT_SHA="$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")"

#######################################
# Print the first numeric value stored under a JSON key in a text file.
# This is a line-oriented scan, not a JSON parser: it looks for
#   "<key>" : <number>
# on each line and prints the number from the first matching line.
# Arguments:
#   $1 - key name (interpolated into an extended regex, as before)
#   $2 - path of the file to scan
# Outputs:
#   the number on stdout, or nothing when the key has no numeric value
# Returns:
#   0 when a value was printed, 1 otherwise
#######################################
extract_json_number() {
  local key="$1"
  local file="$2"
  local value
  # The capture is anchored to the requested key so that other keys on the
  # same line cannot be picked up, and whitespace around ':' is optional.
  value="$(sed -nE "s/.*\"${key}\"[[:space:]]*:[[:space:]]*(-?[0-9]+(\.[0-9]+)?([eE][-+]?[0-9]+)?).*/\1/p" "$file" | head -n1)" || true
  if [[ -z "$value" ]]; then
    return 1
  fi
  printf '%s\n' "$value"
}

#######################################
# Nearest-rank percentile of a list of numbers.
# The rank is ceil(p * n / 100), clamped to the range [1, n], and the value
# at that position of the numerically sorted input is printed.
# Arguments:
#   $1   - percentile as an integer (e.g. 50, 95, 99)
#   $2.. - the values (integers or decimals)
# Outputs:
#   the selected value on stdout; 0 when no values are given
#######################################
percentile() {
  local p="$1"
  shift
  local values=("$@")
  local n="${#values[@]}"
  if [[ "$n" -eq 0 ]]; then
    echo 0
    return
  fi
  local sorted_str
  # LC_ALL=C keeps '.' as the decimal point regardless of the user's locale.
  sorted_str="$(printf '%s\n' "${values[@]}" | LC_ALL=C sort -n)"
  local sorted=()
  local line # kept local so the caller's variables are not clobbered
  while IFS= read -r line; do
    sorted+=("$line")
  done <<<"$sorted_str"
  local rank=$(( (p * n + 99) / 100 ))
  if (( rank < 1 )); then
    rank=1
  elif (( rank > n )); then
    # p above 100 would otherwise index past the end of the array.
    rank="$n"
  fi
  echo "${sorted[rank - 1]}"
}

now_ns() {
  perl -MTime::HiRes=time -e 'printf "%.0f\n", time() * 1000000000'
}

# Build the binary under test; cargo's stdout is discarded, stderr is kept.
echo "Building ir binary..."
cargo build --bin ir >/dev/null

# All `ir` invocations run inside a throw-away working directory that is
# removed on every exit path.
WORKDIR="$(mktemp -d)"
trap 'rm -rf "$WORKDIR"' EXIT
echo "Preparing soak dataset in $WORKDIR..."

# Number of seed nodes to ingest before the soak starts. Overridable from
# the environment; the default matches the previously hard-coded value.
SEED_NODE_COUNT="${SEED_NODE_COUNT:-1200}"

# Seed nodes 1..SEED_NODE_COUNT, each linked to its next two ids. A single
# subshell is used so the directory change happens once rather than per node;
# `set -e` is inherited, so the first failing ingest still aborts the script.
(
  cd "$WORKDIR"
  for ((id = 1; id <= SEED_NODE_COUNT; id++)); do
    "$ROOT_DIR/target/debug/ir" ingest-node "$id" 1 "$((id + 1)),$((id + 2))"
  done
) >/dev/null

# Query text for the three profiles exercised in every cycle.
q_reco='MATCH (n) RETURN n LIMIT 25'
q_fraud='MATCH (n) WHERE vector.cosine(n.embedding, $vec) > 0.80 RETURN n LIMIT 20'
q_supply='MATCH (n) WHERE vector.cosine(n.embedding, $vec) > 0.20 RETURN n LIMIT 50'

# Samples collected across all cycles.
declare -a latencies=()
declare -a ingest_eps_values=()

# Markdown bullet list with one line per cycle.
cycle_rows=""

# Running totals.
query_total=0 query_errors=0
total_ingest_accepted=0 total_ingest_rejected=0
injection_attempts=0 injection_recover_failures=0

# Main soak loop. Each cycle:
#   1. runs one timed batch ingest and records accepted/rejected counts,
#   2. runs the fraud, reco and supply queries and records their latency,
#   3. on every INJECT_EVERY-th cycle appends junk bytes to the newest WAL
#      file and checks that `ir recover` still succeeds.
# An arithmetic loop is used instead of `seq` so that SOAK_CYCLES=0 runs zero
# cycles everywhere (BSD `seq 1 0` counts downward).
numeric_re='^[0-9]+(\.[0-9]+)?$'
for ((cycle = 1; cycle <= SOAK_CYCLES; cycle++)); do
  cycle_prefix="cycle_${cycle}"
  ingest_log="$LOG_DIR/${cycle_prefix}_ingest.log"
  # Each cycle ingests into its own id range.
  ingest_start_id=$((600000 + cycle * 5000))

  ingest_start_ns="$(now_ns)"
  if (
    cd "$WORKDIR"
    "$ROOT_DIR/target/debug/ir" ingest-batch-edge-loop "$ingest_start_id" "$INGEST_COUNT" "$INGEST_ROUNDS" 1 "p4-soak-${cycle}"
  ) >"$ingest_log" 2>&1; then
    # Counts are taken from the first accepted=/rejected= tokens in the log.
    accepted="$(sed -nE 's/.*accepted=([0-9]+).*/\1/p' "$ingest_log" | head -n1)"
    rejected="$(sed -nE 's/.*rejected=([0-9]+).*/\1/p' "$ingest_log" | head -n1)"
    accepted="${accepted:-0}"
    rejected="${rejected:-0}"
  else
    accepted=0
    rejected=0
    # NOTE(review): a failed ingest is counted in query_errors without a
    # matching query_total increment, so it raises the reported error rate.
    # Kept as-is; confirm this is the intended gate behaviour.
    query_errors=$((query_errors + 1))
  fi
  ingest_end_ns="$(now_ns)"
  ingest_elapsed_ns=$((ingest_end_ns - ingest_start_ns))
  ingest_elapsed_sec="$(awk -v ns="$ingest_elapsed_ns" 'BEGIN { printf "%.6f", ns / 1000000000.0 }')"
  ingest_eps="$(awk -v events="$accepted" -v sec="$ingest_elapsed_sec" 'BEGIN { if (sec <= 0) { print "0.00"; } else { printf "%.2f", events / sec; } }')"
  ingest_eps_values+=("$ingest_eps")
  total_ingest_accepted=$((total_ingest_accepted + accepted))
  total_ingest_rejected=$((total_ingest_rejected + rejected))

  for profile in fraud reco supply; do
    query_log="$LOG_DIR/${cycle_prefix}_${profile}.log"
    case "$profile" in
      fraud) query_str="$q_fraud" ;;
      supply) query_str="$q_supply" ;;
      *) query_str="$q_reco" ;;
    esac

    if (
      cd "$WORKDIR"
      "$ROOT_DIR/target/debug/ir" query "$query_str"
    ) >"$query_log" 2>&1; then
      latency="$(extract_json_number "latency_micros" "$query_log" || true)"
      # Strip stray whitespace, then accept the sample only if it really is
      # a number; anything else would corrupt the percentiles and reports.
      latency="${latency//[[:space:]]/}"
      if [[ "$latency" =~ $numeric_re ]]; then
        latencies+=("$latency")
      else
        query_errors=$((query_errors + 1))
      fi
    else
      query_errors=$((query_errors + 1))
    fi
    query_total=$((query_total + 1))
  done

  # Decide whether this cycle injects a failure. INJECT_EVERY <= 0 disables
  # injection instead of triggering a division by zero.
  injected=0
  if ((INJECT_EVERY > 0)); then
    if ((cycle % INJECT_EVERY == 0)); then
      injected=1
    fi
  fi

  recovery_ok=true
  if ((injected == 1)); then
    injection_attempts=$((injection_attempts + 1))
    # Pick the last WAL segment in glob (sorted) order without parsing `ls`.
    wal_file=""
    for wal_candidate in "$WORKDIR"/data/wal/ir.wal.*; do
      if [[ -e "$wal_candidate" ]]; then
        wal_file="$wal_candidate"
      fi
    done
    if [[ -z "$wal_file" ]]; then
      # Nothing to corrupt counts as a failed injection.
      recovery_ok=false
      injection_recover_failures=$((injection_recover_failures + 1))
    else
      # Append junk bytes to the end of the segment, then run recovery.
      printf 'TAILCORRUPT' >>"$wal_file"
      recover_log="$LOG_DIR/${cycle_prefix}_recover.log"
      if ! (
        cd "$WORKDIR"
        "$ROOT_DIR/target/debug/ir" recover
      ) >"$recover_log" 2>&1; then
        recovery_ok=false
        injection_recover_failures=$((injection_recover_failures + 1))
      fi
    fi
  fi

  cycle_rows+="- cycle ${cycle}: ingest_eps=${ingest_eps}, accepted=${accepted}, rejected=${rejected}, injected=${injected}, recovery_ok=${recovery_ok}"$'\n'
done

# Aggregate the samples collected during the soak.
#
# The ${arr[@]+"${arr[@]}"} form expands to nothing for an empty array.
# A plain "${arr[@]}" aborts under `set -u` on bash older than 4.4, which
# would kill the script exactly when every query failed.
query_p95="$(percentile 95 ${latencies[@]+"${latencies[@]}"})"
query_p99="$(percentile 99 ${latencies[@]+"${latencies[@]}"})"
ingest_eps_p50="$(percentile 50 ${ingest_eps_values[@]+"${ingest_eps_values[@]}"})"
ingest_eps_p95="$(percentile 95 ${ingest_eps_values[@]+"${ingest_eps_values[@]}"})"

# Min/max fall back to 0 when no cycle ran, so the reports never contain an
# empty numeric field. LC_ALL=C keeps '.' as the decimal point for sort -n.
ingest_eps_min=0
ingest_eps_max=0
if ((${#ingest_eps_values[@]} > 0)); then
  ingest_eps_sorted="$(printf '%s\n' "${ingest_eps_values[@]}" | LC_ALL=C sort -n)"
  ingest_eps_min="$(head -n1 <<<"$ingest_eps_sorted")"
  ingest_eps_max="$(tail -n1 <<<"$ingest_eps_sorted")"
fi

# With no queries at all the run is reported as fully failed (rate 1).
error_rate="$(awk -v total="$query_total" -v err="$query_errors" 'BEGIN {
  if (total <= 0) { print "1.000000"; } else { printf "%.6f", err / total; }
}')"

# Write the query counters and latency percentiles in Prometheus text
# format; this file is the input handed to the SLO gate script.
printf '%s\n' \
  "# TYPE iridium_query_total counter" \
  "iridium_query_total $query_total" \
  "# TYPE iridium_query_errors counter" \
  "iridium_query_errors $query_errors" \
  "# TYPE iridium_query_p95_latency_micros gauge" \
  "iridium_query_p95_latency_micros $query_p95" \
  "# TYPE iridium_query_p99_latency_micros gauge" \
  "iridium_query_p99_latency_micros $query_p99" \
  >"$METRICS_FILE"

# Run the gate with thresholds and output paths passed through its
# environment. Its exit status is captured rather than allowed to abort the
# script, so the reports below are always written.
if MAX_ERROR_RATE="$MAX_ERROR_RATE" \
  MAX_P95_MICROS="$MAX_P95_MICROS" \
  MAX_P99_MICROS="$MAX_P99_MICROS" \
  OUT_JSON="$GATE_JSON" OUT_MD="$GATE_MD" \
  bash "$SLO_GATE_SCRIPT" "$METRICS_FILE" >/dev/null; then
  gate_status=0
else
  gate_status=$?
fi

# The run passes only when the gate passed and every injected failure was
# recovered from.
if ((gate_status == 0 && injection_recover_failures == 0)); then
  overall_pass=true
else
  overall_pass=false
fi

# Machine-readable report. The gate verdict is derived from gate_status
# before the document is rendered.
if [[ "$gate_status" -eq 0 ]]; then
  slo_gate_pass_json="true"
else
  slo_gate_pass_json="false"
fi

cat <<EOF >"$REPORT_JSON"
{
  "timestamp_utc": "$TIMESTAMP_UTC",
  "commit_sha": "$COMMIT_SHA",
  "soak_cycles": $SOAK_CYCLES,
  "ingest_count": $INGEST_COUNT,
  "ingest_rounds": $INGEST_ROUNDS,
  "inject_every": $INJECT_EVERY,
  "totals": {
    "query_total": $query_total,
    "query_errors": $query_errors,
    "error_rate": $error_rate,
    "ingest_accepted": $total_ingest_accepted,
    "ingest_rejected": $total_ingest_rejected
  },
  "query_latency": {
    "p95_micros": $query_p95,
    "p99_micros": $query_p99
  },
  "ingest_events_per_sec": {
    "min": $ingest_eps_min,
    "p50": $ingest_eps_p50,
    "p95": $ingest_eps_p95,
    "max": $ingest_eps_max
  },
  "failure_injection": {
    "attempts": $injection_attempts,
    "recover_failures": $injection_recover_failures
  },
  "thresholds": {
    "max_error_rate": $MAX_ERROR_RATE,
    "max_p95_latency_micros": $MAX_P95_MICROS,
    "max_p99_latency_micros": $MAX_P99_MICROS
  },
  "slo_gate_pass": $slo_gate_pass_json,
  "overall_pass": $overall_pass
}
EOF

# Human-readable report with the same figures as the JSON report plus the
# per-cycle summary lines gathered during the soak.
if [[ "$gate_status" -eq 0 ]]; then
  slo_gate_pass_md="true"
else
  slo_gate_pass_md="false"
fi

cat <<EOF >"$REPORT_MD"
# Soak + Failure Injection Report

- Timestamp (UTC): $TIMESTAMP_UTC
- Commit: $COMMIT_SHA
- Soak cycles: $SOAK_CYCLES
- Inject every: $INJECT_EVERY cycles

## Totals
- query_total: $query_total
- query_errors: $query_errors
- error_rate: $error_rate
- ingest_accepted: $total_ingest_accepted
- ingest_rejected: $total_ingest_rejected

## Query Latency
- p95 (micros): $query_p95
- p99 (micros): $query_p99

## Ingest Events/Sec
- min: $ingest_eps_min
- p50: $ingest_eps_p50
- p95: $ingest_eps_p95
- max: $ingest_eps_max

## Failure Injection
- attempts: $injection_attempts
- recover_failures: $injection_recover_failures

## Per-Cycle Summary
$cycle_rows
## Thresholds
- max_error_rate: $MAX_ERROR_RATE
- max_p95_latency_micros: $MAX_P95_MICROS
- max_p99_latency_micros: $MAX_P99_MICROS

## Gates
- slo_gate_pass: $slo_gate_pass_md
- overall_pass: $overall_pass
EOF

# List the artifact paths, then turn the overall verdict into the exit code.
printf '%s\n' "Wrote:"
printf '  %s\n' \
  "$REPORT_JSON" \
  "$REPORT_MD" \
  "$METRICS_FILE" \
  "$GATE_JSON" \
  "$GATE_MD"

[[ "$overall_pass" == "true" ]] || {
  echo "soak/failure check failed thresholds or recovery checks" >&2
  exit 1
}
