from __future__ import annotations
import argparse
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
USER_AGENT = (
"forensic-catalog-gtfobins-fetcher/0.1 "
"(+https://github.com/SecurityRonin/forensicnomicon)"
)
GITHUB_API = "https://api.github.com/repos/GTFOBins/GTFOBins.github.io"
GTFOBINS_BASE = "https://gtfobins.github.io/gtfobins"
DELAY = 0.3
def fetch(url: str, *, retries: int = 2) -> bytes:
req = urllib.request.Request(
url,
headers={
"User-Agent": USER_AGENT,
"Accept": "application/vnd.github+json",
},
)
last_exc: Exception | None = None
for attempt in range(retries + 1):
try:
with urllib.request.urlopen(req, timeout=30) as resp:
return resp.read()
except (urllib.error.HTTPError, urllib.error.URLError) as exc:
last_exc = exc
if isinstance(exc, urllib.error.HTTPError) and exc.code == 403:
print(" GitHub rate-limited — sleeping 60s", file=sys.stderr)
time.sleep(60)
else:
print(f" attempt {attempt + 1}: {exc}", file=sys.stderr)
if attempt < retries:
time.sleep(DELAY)
raise RuntimeError(f"failed to fetch {url}: {last_exc}") from last_exc
def list_gtfobin_names() -> list[str]:
url = f"{GITHUB_API}/contents/_gtfobins"
raw = fetch(url)
files = json.loads(raw.decode("utf-8"))
names: list[str] = []
for f in files:
fname = f.get("name", "")
if fname.endswith(".md"):
names.append(fname[:-3]) return sorted(names)
def parse_functions_from_frontmatter(content: str) -> list[str]:
match = re.match(r"^---\s*\n(.*?\n)---", content, re.DOTALL)
if not match:
return []
yaml_block = match.group(1)
funcs: list[str] = []
in_functions = False
for line in yaml_block.splitlines():
stripped = line.strip()
if stripped == "functions:":
in_functions = True
elif in_functions:
if stripped.startswith("- "):
funcs.append(stripped[2:].strip())
elif stripped and not stripped.startswith("#"):
in_functions = False
return funcs
def scrape_all(names: list[str], token: str | None = None) -> list[dict]:
entries: list[dict] = []
headers: dict[str, str] = {"User-Agent": USER_AGENT}
if token:
headers["Authorization"] = f"token {token}"
for i, name in enumerate(names):
url = f"https://raw.githubusercontent.com/GTFOBins/GTFOBins.github.io/master/_gtfobins/{name}.md"
if i > 0:
time.sleep(DELAY)
try:
req = urllib.request.Request(url, headers=headers)
with urllib.request.urlopen(req, timeout=15) as resp:
content = resp.read().decode("utf-8", errors="replace")
except Exception as exc:
print(f" warn: skipping {name}: {exc}", file=sys.stderr)
continue
funcs = parse_functions_from_frontmatter(content)
entries.append({
"name": name,
"functions": funcs,
"url": f"{GTFOBINS_BASE}/{name}/",
})
if (i + 1) % 50 == 0:
print(f" scraped {i + 1}/{len(names)} …", file=sys.stderr)
return entries
def parse_args() -> argparse.Namespace:
p = argparse.ArgumentParser(description=__doc__)
p.add_argument("--dry-run", action="store_true",
help="list names only (no fetch of individual pages)")
p.add_argument("--output", default=None,
help="output path (default: archive/sources/gtfobins_linux.json)")
p.add_argument("--token", default=None,
help="GitHub personal access token to raise rate-limit ceiling")
return p.parse_args()
def default_output_path() -> str:
script_dir = os.path.dirname(os.path.abspath(__file__))
return os.path.normpath(
os.path.join(script_dir, "..", "archive", "sources", "gtfobins_linux.json")
)
def main() -> int:
args = parse_args()
output_path = args.output or default_output_path()
print("fetching GTFOBins file list …", file=sys.stderr)
try:
names = list_gtfobin_names()
except RuntimeError as exc:
print(f"error: {exc}", file=sys.stderr)
return 1
print(f"found {len(names)} binaries", file=sys.stderr)
if args.dry_run:
print(json.dumps(names[:10], indent=2))
return 0
print("fetching individual pages …", file=sys.stderr)
entries = scrape_all(names, token=args.token)
if not entries:
print("error: no entries scraped", file=sys.stderr)
return 1
print(f"scraped {len(entries)} entries", file=sys.stderr)
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as fh:
json.dump(entries, fh, indent=2, ensure_ascii=False)
fh.write("\n")
print(f"wrote {len(entries)} entries to {output_path}", file=sys.stderr)
return 0
if __name__ == "__main__":
raise SystemExit(main())