#!/usr/bin/env bash
#
# Promote OS/ISP-firewall-confirmed discovered hosts into the STATIC blocklist.
#
# The runtime feedback loop records hosts that physical peers confirmed are
# blocked by a native (OS / ISP) firewall — `discovered_bad_hosts.block = true`.
# This script exports those into `spider-rs/bad_websites/websites.txt`, which
# `spider_firewall`'s build.rs already fetches and bakes into the static FST, so
# the NEXT release ships them as built-in static entries.
#
# Properties:
#   * Review-gated — opens a PR, never auto-merges.
#   * Idempotent   — appends only hosts not already in websites.txt (minimal diff);
#                    re-running with nothing new exits without a PR.
#   * Safe         — read-only against the DB; no writes to the discovered tables.
#
# Requirements: `psql`, `git`, and `gh` (authenticated).
#
# Usage:
#   DISCOVERED_DB_URL='postgres://…spider_projects…' ./export_discovered_hosts.sh
#
# Env:
#   DISCOVERED_DB_URL  (required) Postgres URL of the spider_projects DB.
#   MIN_HITS           (default 2) require >= N report events before promoting,
#                      so a single transient block can't reach the static list.
#                      Must be a non-negative integer (it is placed in the SQL
#                      query's `hit_count >= N` condition).
#   BAD_WEBSITES_REPO  (default spider-rs/bad_websites) the curated source repo.
# Strict mode: abort on a failed command, an unset variable, or a failed
# pipeline stage.
set -euo pipefail

# --- Configuration (all taken from the environment) --------------------------
# `:?` aborts with the given message when DISCOVERED_DB_URL is unset or empty.
DB_URL="${DISCOVERED_DB_URL:?set DISCOVERED_DB_URL to the spider_projects Postgres URL}"
MIN_HITS="${MIN_HITS:-2}"
REPO="${BAD_WEBSITES_REPO:-spider-rs/bad_websites}"
# Timestamped (to the second) so each run works on its own branch name.
BRANCH="discovered-export-$(date +%Y%m%d-%H%M%S)"

# MIN_HITS is interpolated directly into the SQL text handed to psql, so reject
# anything that is not a plain non-negative integer before it gets that far.
if [[ ! "$MIN_HITS" =~ ^[0-9]+$ ]]; then
  echo "[export] MIN_HITS must be a non-negative integer (got '${MIN_HITS}')" >&2
  exit 2
fi

# Scratch directory for intermediate lists and the repo clone; the EXIT trap
# removes it on every exit path.
work="$(mktemp -d)"
trap 'rm -rf -- "$work"' EXIT

# Step 1 — read the firewall-confirmed hosts from the DB into confirmed.txt.
printf '%s\n' "[export] pulling block=true hosts (hit_count >= ${MIN_HITS}) from the DB…"

# Read-only SELECT: hosts flagged block=true with at least MIN_HITS reports.
confirmed_sql="SELECT host FROM public.discovered_bad_hosts
     WHERE block = true AND hit_count >= ${MIN_HITS}
     ORDER BY host"

# psql -A (unaligned) -t (tuples only) -q (quiet) prints one bare host per line.
# Blank lines are dropped and the list is sorted/deduplicated so it can be fed
# to `comm` later.
psql "$DB_URL" -Atq -c "$confirmed_sql" \
  | sed '/^$/d' \
  | sort -u \
  > "$work/confirmed.txt"

# Arithmetic expansion strips the padding some `wc` implementations emit.
confirmed=$(( $(wc -l < "$work/confirmed.txt") ))
printf '%s\n' "[export] ${confirmed} firewall-confirmed host(s) in the DB"

# Nothing confirmed → stop before cloning anything.
if (( confirmed == 0 )); then
  printf '%s\n' "[export] nothing to export — done."
  exit 0
fi

# Step 2 — shallow-clone the curated repo and work out which hosts are new.
echo "[export] cloning ${REPO}…"
# Keep the clone quiet on success, but capture its output so a failure (auth,
# network, bad repo name) is reported instead of the script dying silently.
if ! gh repo clone "$REPO" "$work/repo" -- --depth 1 >"$work/clone.log" 2>&1; then
  echo "[export] failed to clone ${REPO}:" >&2
  cat "$work/clone.log" >&2
  exit 1
fi
cd "$work/repo"
# Ensure the file exists so the comparison below works on a repo without one.
touch websites.txt

# Append only the hosts not already present (preserve existing order → tiny diff).
# `comm -13` prints lines unique to the second input; both inputs are sorted
# with `sort -u`, as comm requires. The file itself is not reordered.
comm -13 <(sort -u websites.txt) "$work/confirmed.txt" > "$work/added.txt"
added="$(wc -l < "$work/added.txt" | tr -d ' ')"
if [ "$added" -eq 0 ]; then
  echo "[export] websites.txt already contains every confirmed host — no PR needed."
  exit 0
fi

# Step 3 — append the new hosts, commit on a fresh branch, push, and open a PR.

# If the existing file is non-empty and does not end with a newline, add one
# first; otherwise the first new host would be glued onto the last existing
# entry. `$(tail -c 1 …)` is empty exactly when the final byte is a newline.
if [ -s websites.txt ] && [ -n "$(tail -c 1 websites.txt)" ]; then
  printf '\n' >> websites.txt
fi
cat "$work/added.txt" >> websites.txt
echo "[export] appending ${added} new host(s) to websites.txt"

git checkout -q -b "$BRANCH"
git add websites.txt
# Commit message is read from stdin; the heredoc is unquoted so ${added} and
# ${MIN_HITS} expand, and backticks are escaped to stay literal.
git commit -q -F - <<MSG
discovered: add ${added} OS/ISP-firewall-confirmed hosts

Exported from spider_projects \`discovered_bad_hosts\` where block=true and
hit_count >= ${MIN_HITS} — hosts physical peers confirmed are blocked by a
native (OS / ISP) firewall. spider_firewall's build.rs ingests websites.txt
into the static FST on the next release.

Auto-generated by spider_firewall/scripts/export_discovered_hosts.sh.
MSG
git push -q -u origin "$BRANCH"

# Review-gated: only opens the PR, never merges. --head names the branch that
# was just pushed explicitly rather than relying on current-branch detection.
gh pr create --repo "$REPO" \
  --head "$BRANCH" \
  --title "discovered: +${added} firewall-confirmed hosts" \
  --body "Adds **${added}** host(s) confirmed blocked by OS/ISP native firewalls by physical peers (\`discovered_bad_hosts.block = true\`, \`hit_count >= ${MIN_HITS}\`).

Source: the spider_firewall runtime feedback loop. \`spider_firewall\`'s build.rs already fetches \`websites.txt\` and bakes it into the static FST, so merging promotes these to built-in static entries on the next release. Review before merge." \
  >/dev/null

echo "[export] ✓ PR opened on ${REPO} with ${added} host(s)."
