import argparse
import json
import re
import requests
class Category:
    """A content category identified by a numeric code, plus its subcategories."""

    def __init__(self, code: int, name: str):
        """Store the category's code and name; it starts with no subcategories."""
        self.code, self.name = code, name
        # Each entry is a (code, name) pair, kept in insertion order.
        self.subcategories: list[tuple[int, str]] = []

    def add_subcategory(self, code: int, name: str):
        """Record one more subcategory as a ``(code, name)`` pair."""
        entry = (code, name)
        self.subcategories.append(entry)
class Data:
    """Bundle of everything scraped: the categories and the tracker URLs."""

    def __init__(self, categories: list[Category], trackers: list[str]):
        """Keep references to the given lists (they are not copied)."""
        self.categories, self.trackers = categories, trackers
def get_data(mirror: str = "https://thepiratebay.org", timeout: float = 30.0) -> Data:
    """Download ``/static/main.js`` from a mirror and extract categories and trackers.

    Args:
        mirror: Base URL of the site to scrape. A trailing slash is tolerated.
        timeout: Seconds to wait for the server before giving up, so a
            stalled mirror cannot hang the script indefinitely.

    Returns:
        A ``Data`` holding the top-level categories (each with its
        subcategories) and the ``udp://`` tracker URLs found in the script.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    # NOTE: redirects are deliberately not followed. A 3xx answer is not an
    # error for raise_for_status(), so a redirecting mirror yields empty data.
    response = requests.get(
        mirror.rstrip("/") + "/static/main.js",
        allow_redirects=False,
        timeout=timeout,
    )
    response.raise_for_status()
    script = response.text

    categories: list[Category] = []
    for raw_code, raw_name in re.findall(r"category:(\d{3})[^>]*>([^<]+)<", script):
        code, name = int(raw_code), raw_name.strip()
        if code % 100 == 0:
            # Codes that are multiples of 100 open a new top-level category.
            categories.append(Category(code, name))
        elif categories:
            # Any other code belongs to the most recently seen top-level category.
            categories[-1].add_subcategory(code, name)
        # A subcategory seen before any top-level category has no parent to
        # attach to; it is skipped rather than raising IndexError.

    trackers = re.findall(r"encodeURIComponent\('(udp://[^']+)'", script)
    return Data(categories, trackers)
def print_data(data: Data):
    """Write a human-readable listing of categories and trackers to stdout.

    Each category is listed with its code and name, followed by its
    subcategories prefixed with the parent's name; trackers come last.
    """
    report = ["Categories:"]
    for parent in data.categories:
        report.append(f" {parent.code} {parent.name}")
        report.extend(
            f" {sub_code} {parent.name}: {sub_name}"
            for sub_code, sub_name in parent.subcategories
        )

    # The leading newline leaves a blank line between the two sections.
    report.append("\nTrackers:")
    report.extend(f" {url}" for url in data.trackers)

    print("\n".join(report))
def json_dump(data: Data):
    """Print the scraped data to stdout as JSON indented by four spaces.

    The document has a ``categories`` array (each item carrying ``code``,
    ``name`` and a ``subcategories`` array) and a ``trackers`` array.
    """
    payload = {"categories": [], "trackers": data.trackers}
    for parent in data.categories:
        children = []
        for sub_code, sub_name in parent.subcategories:
            children.append({"code": sub_code, "name": sub_name})
        payload["categories"].append(
            {"code": parent.code, "name": parent.name, "subcategories": children}
        )
    print(json.dumps(payload, indent=4))
def codegen(data: Data):
    """Print Rust source declaring ``CATEGORIES`` and ``TRACKERS`` constants.

    ``CATEGORIES`` is a flat array of ``(code, name)`` pairs in which every
    subcategory name is prefixed with its parent's name. ``TRACKERS`` is an
    array of the tracker URLs. Scraped text is escaped so that a quote or
    backslash in a name cannot break the generated string literals.
    """

    def rust_str(text) -> str:
        """Render ``text`` as a double-quoted Rust string literal."""
        # Backslashes are escaped first so the backslashes introduced for
        # quotes are not doubled afterwards.
        escaped = str(text).replace("\\", "\\\\").replace('"', '\\"')
        return f'"{escaped}"'

    # Flatten the two-level hierarchy into one list of (code, display name).
    flat_cats: list[tuple[int, str]] = []
    for category in data.categories:
        flat_cats.append((category.code, category.name))
        flat_cats.extend(
            (code, f"{category.name}: {name}") for code, name in category.subcategories
        )

    lines = [
        "//! This file was generated automatically by scraper.py.",
        "",
        "/// Category codes and associated names.",
        f"pub const CATEGORIES: [(u16, &str); {len(flat_cats)}] = [",
    ]
    lines.extend(f" ({code}, {rust_str(name)})," for code, name in flat_cats)
    lines += [
        "];",
        "",
        "/// Trackers used in magnet links on thepiratebay.org.",
        f"pub const TRACKERS: [&str; {len(data.trackers)}] = [",
    ]
    lines.extend(f" {rust_str(tracker)}," for tracker in data.trackers)
    lines.append("];")

    # The generated text ends with a newline and print() adds one more.
    print("\n".join(lines) + "\n")
if __name__ == "__main__":
    # Command-line entry point: scrape the chosen mirror, then emit one format.
    cli = argparse.ArgumentParser(
        description="Get content categories and trackers from thepiratebay.org"
    )
    cli.add_argument(
        "-m", "--mirror", default="https://thepiratebay.org", help="Mirror to use"
    )

    # At most one output format may be selected on the command line.
    formats = cli.add_mutually_exclusive_group()
    for short_flag, long_flag, description in (
        ("-t", "--text", "Output as text"),
        ("-j", "--json", "Output as JSON"),
        ("-c", "--codegen", "Output Rust code"),
    ):
        formats.add_argument(
            short_flag, long_flag, action="store_true", help=description
        )

    options = cli.parse_args()
    scraped = get_data(options.mirror)

    # Plain text is the fallback, so --text needs no explicit check.
    if options.json:
        emit = json_dump
    elif options.codegen:
        emit = codegen
    else:
        emit = print_data
    emit(scraped)