#!/usr/bin/env bash
# Performance baseline harness: replays the fraud / recommendation /
# supply-chain query workloads against a freshly built `ir` binary, measures
# batched ingest throughput, and writes JSON + Markdown reports into
# $ARTIFACTS_DIR.
set -euo pipefail

# Repository root: this script lives one directory below it.
ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
cd "$ROOT_DIR"

# Tunables — every one of these can be overridden from the environment.
ARTIFACTS_DIR="${ARTIFACTS_DIR:-$ROOT_DIR/artifacts}"   # where reports/logs land
QUERY_RUNS="${QUERY_RUNS:-25}"                          # query repetitions per workload
INGEST_BATCH_COUNT="${INGEST_BATCH_COUNT:-2000}"        # events per ingest batch
INGEST_BATCH_ROUNDS="${INGEST_BATCH_ROUNDS:-5}"         # number of ingest batches
REPORT_PREFIX="${REPORT_PREFIX:-perf_baseline}"         # prefix for report/log file names

mkdir -p "$ARTIFACTS_DIR"
LOG_DIR="$ARTIFACTS_DIR/${REPORT_PREFIX}_logs"
REPORT_JSON="$ARTIFACTS_DIR/${REPORT_PREFIX}_report.json"
REPORT_MD="$ARTIFACTS_DIR/${REPORT_PREFIX}_report.md"
# NOTE(review): presumably written by scripts/workload_validation.sh (invoked
# below) — confirm that script's output path matches.
VALIDATION_JSON="$ARTIFACTS_DIR/workload_validation_report.json"
mkdir -p "$LOG_DIR"

TIMESTAMP_UTC="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
# Short commit hash for report provenance; "unknown" outside a git checkout.
COMMIT_SHA="$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")"

extract_json_number() {
  # Extract the value that immediately follows `"key":` from the first
  # matching line of a JSON-ish log file.
  #   $1 - JSON key name (must not contain regex metacharacters)
  #   $2 - file to scan
  # Prints the value on stdout; returns non-zero (with no output) if the key
  # is absent — callers rely on that via `... || echo 0`.
  #
  # BUGFIX: the previous sed ('s/.*: ([^,}]+).*/\1/') matched the LAST
  # ": value" on the line because `.*` is greedy, so any line holding more
  # than one key (e.g. a compact JSON object) yielded the wrong field.
  # Anchoring the match on the key itself with grep -o fixes that.
  local key="$1"
  local file="$2"
  grep -oE "\"$key\"[[:space:]]*:[[:space:]]*[^,}[:space:]]+" "$file" \
    | head -n1 \
    | sed -E 's/.*:[[:space:]]*//'
}

percentile() {
  # Nearest-rank percentile of a list of numbers.
  #   $1   - percentile in [0,100]
  #   $2.. - the samples (need not be sorted)
  # Prints the sample at rank ceil(p*n/100) (1-based); prints 0 for an
  # empty sample list.
  local pct="$1"
  shift
  local count=$#
  if (( count == 0 )); then
    echo 0
    return
  fi
  # Numeric sort, then collect back into an array (one sample per line).
  local ordered=()
  local sample
  while IFS= read -r sample; do
    ordered+=("$sample")
  done < <(printf '%s\n' "$@" | sort -n)
  # Integer ceiling of pct*count/100, clamped to at least rank 1.
  local rank=$(( (pct * count + 99) / 100 ))
  if (( rank < 1 )); then
    rank=1
  fi
  echo "${ordered[rank - 1]}"
}

now_ns() {
  perl -MTime::HiRes=time -e 'printf "%.0f\n", time() * 1000000000'
}

capture_query_latencies() {
  # Execute one workload query QUERY_RUNS times and print one latency value
  # (micros) per line on stdout.
  #   $1 - working directory the `ir` binary should run in
  #   $2 - workload name, used as the per-run log file prefix
  #   $3 - query string passed verbatim to `ir query`
  # Reads globals: QUERY_RUNS, LOG_DIR, ROOT_DIR. Exits non-zero (killing the
  # calling command substitution) if any run fails; the run's log is kept.
  local workdir="$1" query_name="$2" query_str="$3"
  local run log_file latency
  for (( run = 1; run <= QUERY_RUNS; run++ )); do
    log_file="$LOG_DIR/${query_name}_run_${run}.log"
    if ! (
      cd "$workdir"
      "$ROOT_DIR/target/debug/ir" query "$query_str"
    ) >"$log_file" 2>&1; then
      echo "query run failed for $query_name (run $run), see $log_file" >&2
      exit 1
    fi
    # Missing latency field in the log is recorded as 0 rather than failing.
    latency="$(extract_json_number "latency_micros" "$log_file" || echo 0)"
    echo "$latency"
  done
}

echo "Refreshing workload validation report for workload baselines..."
"$ROOT_DIR/scripts/workload_validation.sh" >/dev/null

echo "Building ir binary..."
cargo build --bin ir >/dev/null

# Scratch dataset directory, removed on any exit path.
WORKDIR="$(mktemp -d)"
trap 'rm -rf "$WORKDIR"' EXIT

# Seed 500 nodes; node <id> links to its two successors <id+1>,<id+2>.
echo "Preparing workload dataset in $WORKDIR..."
for (( id = 1; id <= 500; id++ )); do
  (
    cd "$WORKDIR"
    "$ROOT_DIR/target/debug/ir" ingest-node "$id" 1 "$((id + 1)),$((id + 2))"
  ) >/dev/null
done

# Workload query definitions (single-quoted: $vec is an `ir` query parameter,
# not a shell variable).
QUERY_FRAUD='MATCH (n) WHERE vector.cosine(n.embedding, $vec) > 0.80 RETURN n LIMIT 20'
QUERY_RECO='MATCH (n) RETURN n LIMIT 10'
QUERY_SUPPLY='MATCH (n) WHERE vector.cosine(n.embedding, $vec) > 0.20 RETURN n LIMIT 50'

echo "Capturing query latencies over $QUERY_RUNS runs per workload..."
# Read one latency per line into each array.
# BUGFIX: the previous `arr=($(cmd))` (ShellCheck SC2207) re-split the output
# on whatever IFS happened to be and glob-expanded every element. Capturing
# into a scalar first also keeps `set -e` effective if a query run fails.
fraud_out="$(capture_query_latencies "$WORKDIR" "fraud" "$QUERY_FRAUD")"
reco_out="$(capture_query_latencies "$WORKDIR" "recommendation" "$QUERY_RECO")"
supply_out="$(capture_query_latencies "$WORKDIR" "supply_chain" "$QUERY_SUPPLY")"
FRAUD_LATENCIES=()
RECO_LATENCIES=()
SUPPLY_LATENCIES=()
# The -n guards keep the arrays empty (not [""]) if QUERY_RUNS is 0.
if [[ -n "$fraud_out" ]]; then
  while IFS= read -r lat; do FRAUD_LATENCIES+=("$lat"); done <<<"$fraud_out"
fi
if [[ -n "$reco_out" ]]; then
  while IFS= read -r lat; do RECO_LATENCIES+=("$lat"); done <<<"$reco_out"
fi
if [[ -n "$supply_out" ]]; then
  while IFS= read -r lat; do SUPPLY_LATENCIES+=("$lat"); done <<<"$supply_out"
fi

# Compute nearest-rank p50/p95/p99 for each workload, assigning the results
# to FRAUD_P50, FRAUD_P95, ... SUPPLY_P99 via indirect expansion so all three
# workloads share a single code path.
for workload in FRAUD RECO SUPPLY; do
  lat_ref="${workload}_LATENCIES[@]"
  lat_values=("${!lat_ref}")
  for pct in 50 95 99; do
    printf -v "${workload}_P${pct}" '%s' "$(percentile "$pct" "${lat_values[@]}")"
  done
done

echo "Capturing batched ingest throughput..."
# Time only the ingest command itself.
# BUGFIX: the end timestamp was previously taken AFTER the sed parsing below,
# so parse overhead was silently added to the measured elapsed time; the
# clock now stops immediately after the ingest command returns.
INGEST_START_NS="$(now_ns)"
out="$(
  cd "$WORKDIR"
  "$ROOT_DIR/target/debug/ir" ingest-batch-edge-loop 100000 "$INGEST_BATCH_COUNT" "$INGEST_BATCH_ROUNDS" 1 perf 2>&1
)"
INGEST_END_NS="$(now_ns)"
INGEST_ELAPSED_NS=$((INGEST_END_NS - INGEST_START_NS))

# Parse "accepted=<n>" / "rejected=<n>" from the tool output. printf (not
# echo) so output starting with "-" or containing backslashes is passed
# through verbatim.
accepted="$(printf '%s\n' "$out" | sed -nE 's/.*accepted=([0-9]+).*/\1/p' | head -n1)"
rejected="$(printf '%s\n' "$out" | sed -nE 's/.*rejected=([0-9]+).*/\1/p' | head -n1)"
if [[ -z "$accepted" || -z "$rejected" ]]; then
  echo "failed to parse ingest result: $out" >&2
  exit 1
fi
INGEST_ACCEPTED_TOTAL=$accepted
INGEST_REJECTED_TOTAL=$rejected
# awk does the float formatting; guard against a zero/negative interval.
INGEST_ELAPSED_SEC="$(awk -v ns="$INGEST_ELAPSED_NS" 'BEGIN { printf "%.6f", ns / 1000000000.0 }')"
INGEST_EVENTS_PER_SEC="$(awk -v events="$INGEST_ACCEPTED_TOTAL" -v sec="$INGEST_ELAPSED_SEC" 'BEGIN { if (sec <= 0) { print "0.00"; } else { printf "%.2f", events / sec; } }')"

# Pull reference latencies from the workload-validation run's logs; a missing
# field is recorded as 0. Assigns VAL_FRAUD_LATENCY, VAL_RECO_LATENCY and
# VAL_SUPPLY_LATENCY via printf -v.
for spec in "FRAUD:fraud" "RECO:reco" "SUPPLY:supply"; do
  val_var="VAL_${spec%%:*}_LATENCY"
  val_log="$ARTIFACTS_DIR/validation_cmd_${spec#*:}.log"
  printf -v "$val_var" '%s' "$(extract_json_number "latency_micros" "$val_log" || echo 0)"
done
# overall_pass is a JSON boolean, so extract true/false directly; default to
# "false" when the key is absent (grep fails -> pipefail -> echo).
VAL_OVERALL_PASS="$(
  grep -E '"overall_pass"[[:space:]]*:' "$VALIDATION_JSON" \
    | head -n1 \
    | sed -E 's/.*: (true|false).*/\1/' \
    || echo "false"
)"

# Machine-readable report. Numeric and boolean fields are interpolated
# UNQUOTED into the JSON, so every variable below must hold a bare number or
# true/false by this point — otherwise the report is invalid JSON.
cat >"$REPORT_JSON" <<EOF
{
  "timestamp_utc": "$TIMESTAMP_UTC",
  "commit_sha": "$COMMIT_SHA",
  "config": {
    "query_runs": $QUERY_RUNS,
    "ingest_batch_count": $INGEST_BATCH_COUNT,
    "ingest_batch_rounds": $INGEST_BATCH_ROUNDS
  },
  "query_latency_micros": {
    "fraud": {"p50": $FRAUD_P50, "p95": $FRAUD_P95, "p99": $FRAUD_P99},
    "recommendation": {"p50": $RECO_P50, "p95": $RECO_P95, "p99": $RECO_P99},
    "supply_chain": {"p50": $SUPPLY_P50, "p95": $SUPPLY_P95, "p99": $SUPPLY_P99}
  },
  "ingest_throughput": {
    "accepted_events": $INGEST_ACCEPTED_TOTAL,
    "rejected_events": $INGEST_REJECTED_TOTAL,
    "elapsed_seconds": $INGEST_ELAPSED_SEC,
    "events_per_sec": $INGEST_EVENTS_PER_SEC
  },
  "workload_validation_reference": {
    "overall_pass": $VAL_OVERALL_PASS,
    "fraud_latency_micros": $VAL_FRAUD_LATENCY,
    "recommendation_latency_micros": $VAL_RECO_LATENCY,
    "supply_chain_latency_micros": $VAL_SUPPLY_LATENCY
  }
}
EOF

# Human-readable Markdown companion report; carries the same values as the
# JSON report.
cat >"$REPORT_MD" <<EOF
# Performance Baseline Report

- Timestamp (UTC): $TIMESTAMP_UTC
- Commit: $COMMIT_SHA

## Config
- query_runs: $QUERY_RUNS
- ingest_batch_count: $INGEST_BATCH_COUNT
- ingest_batch_rounds: $INGEST_BATCH_ROUNDS

## Query Latency (micros)
- Fraud: p50=$FRAUD_P50 p95=$FRAUD_P95 p99=$FRAUD_P99
- Recommendation: p50=$RECO_P50 p95=$RECO_P95 p99=$RECO_P99
- Supply-chain: p50=$SUPPLY_P50 p95=$SUPPLY_P95 p99=$SUPPLY_P99

## Ingest Throughput
- accepted_events=$INGEST_ACCEPTED_TOTAL
- rejected_events=$INGEST_REJECTED_TOTAL
- elapsed_seconds=$INGEST_ELAPSED_SEC
- events_per_sec=$INGEST_EVENTS_PER_SEC

## Validation Reference
- workload_validation.overall_pass=$VAL_OVERALL_PASS
- fraud_latency_micros=$VAL_FRAUD_LATENCY
- recommendation_latency_micros=$VAL_RECO_LATENCY
- supply_chain_latency_micros=$VAL_SUPPLY_LATENCY
EOF

# Tell the operator where the artifacts landed.
printf '%s\n' "Wrote:" "  $REPORT_JSON" "  $REPORT_MD"