from __future__ import annotations
import re
import sys
import urllib.parse
import urllib.request
from html.parser import HTMLParser
# Value sent as the ``User-Agent`` header on the HTTP requests this module
# makes; it names the tool, its version and the project URL.
USER_AGENT = "forensic-catalog-feed-watcher/0.3 (+https://github.com/SecurityRonin/forensicnomicon)"
# Regexes that capture the 11-character video id from the URL shapes this
# module recognises. They are tried in order and the first match wins.
_YT_PATTERNS: list[re.Pattern[str]] = [
    # Watch page. ``v`` must be a whole query parameter: either the first one
    # after "?" or preceded by a separator ("&", or ";" which also covers the
    # HTML-escaped "&amp;"). Without that anchor "?rev=..." would match too.
    re.compile(r"youtube\.com/watch\?(?:[^#]*[&;])?v=([A-Za-z0-9_-]{11})"),
    # Short share link.
    re.compile(r"youtu\.be/([A-Za-z0-9_-]{11})"),
    # Path-style links: embeds (including the youtube-nocookie.com host),
    # Shorts and live streams.
    re.compile(
        r"youtube(?:-nocookie)?\.com/(?:embed|shorts|live)/([A-Za-z0-9_-]{11})"
    ),
]


def extract_youtube_id(url: str) -> str | None:
    """Return the YouTube video id found in *url*, or ``None``.

    Args:
        url: Any URL or text containing one. Matching is a substring search,
            so scheme and subdomain (``www.``, ``m.``, ...) do not matter.

    Returns:
        The 11-character id captured by the first pattern in
        ``_YT_PATTERNS`` that matches, or ``None`` when *url* is empty or
        no pattern matches.
    """
    if not url:
        return None
    for pattern in _YT_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None
_HTML_TAG_RE = re.compile(r"<[^>]+>")
_TIMESTAMP_RE = re.compile(r"-->")
_SEQUENCE_RE = re.compile(r"^\d+$")
def _strip_vtt(vtt: str) -> str:
seen: set[str] = set()
parts: list[str] = []
for line in vtt.splitlines():
line = line.strip()
if not line:
continue
if line == "WEBVTT":
continue
if _TIMESTAMP_RE.search(line):
continue
if _SEQUENCE_RE.match(line):
continue
line = _HTML_TAG_RE.sub("", line).strip()
if line and line not in seen:
seen.add(line)
parts.append(line)
return " ".join(parts)
class _TextExtractor(HTMLParser):
_SKIP: frozenset[str] = frozenset(
{"script", "style", "nav", "header", "footer", "aside", "form"}
)
def __init__(self) -> None:
super().__init__()
self._depth = 0
self._parts: list[str] = []
def handle_starttag(self, tag: str, attrs: list) -> None: if tag in self._SKIP:
self._depth += 1
def handle_endtag(self, tag: str) -> None:
if tag in self._SKIP and self._depth > 0:
self._depth -= 1
def handle_data(self, data: str) -> None:
if self._depth == 0:
text = data.strip()
if text:
self._parts.append(text)
def get_text(self) -> str:
return " ".join(self._parts)
def fetch_youtube_transcript(video_id: str) -> str | None:
    """Fetch the English captions for a YouTube video as plain text.

    Requests ``https://www.youtube.com/api/timedtext`` with ``lang=en`` and
    ``fmt=vtt`` and passes the response through ``_strip_vtt``.

    Args:
        video_id: The YouTube video id.

    Returns:
        The caption text, or ``None`` when *video_id* is empty, the request
        fails for any reason, or the response contains no caption text.
    """
    if not video_id:
        return None
    # Encode the query so an id containing "&", "#" or spaces cannot alter
    # the request; for a well-formed id the URL is unchanged.
    query = urllib.parse.urlencode({"v": video_id, "lang": "en", "fmt": "vtt"})
    url = f"https://www.youtube.com/api/timedtext?{query}"
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=15) as resp:
            raw = resp.read().decode("utf-8", errors="replace")
    except Exception:
        # Deliberately best-effort: any failure means "no transcript".
        return None
    if not raw.strip():
        return None
    return _strip_vtt(raw) or None
def fetch_page_text(url: str) -> str | None:
    """Download *url* and return the text extracted by ``_TextExtractor``.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text, or ``None`` when the request or parsing fails
        for any reason or no text is extracted.
    """
    try:
        req = urllib.request.Request(url, headers={"User-Agent": USER_AGENT})
        with urllib.request.urlopen(req, timeout=15) as resp:
            body = resp.read()
            # Honour the charset declared in Content-Type; default to UTF-8.
            charset = resp.headers.get_content_charset() or "utf-8"
        try:
            html = body.decode(charset, errors="replace")
        except LookupError:
            # The server named an encoding Python does not know.
            html = body.decode("utf-8", errors="replace")
        parser = _TextExtractor()
        parser.feed(html)
        # Flush any text the parser is still buffering at end of input.
        parser.close()
        return parser.get_text() or None
    except Exception:
        # Deliberately best-effort: any failure means "no text available".
        return None
# Case-insensitive URL fragments checked by ``is_noise_url``: the round-up
# and digest posts under /news/, plus slugs containing "-acquires-",
# "-partners-with-" or "-joins-".
_FF_NOISE_PATTERNS: list[re.Pattern[str]] = [
    re.compile(r"/news/digital-forensics-round-up-", re.I),
    re.compile(r"/news/forensic-focus-digest-", re.I),
    re.compile(r"-acquires-", re.I),
    re.compile(r"-partners-with-", re.I),
    re.compile(r"-joins-", re.I),
]


def is_noise_url(url: str) -> bool:
    """Report whether *url* is a forensicfocus.com page matching a noise pattern.

    Args:
        url: The URL to classify.

    Returns:
        ``True`` only when the URL's host is ``forensicfocus.com`` or one of
        its subdomains and some pattern in ``_FF_NOISE_PATTERNS`` is found in
        the URL; ``False`` otherwise.
    """
    # ``hostname`` is lower-cased and excludes port and credentials, so the
    # check is case-insensitive and cannot be satisfied by look-alike hosts
    # such as "notforensicfocus.com" or "forensicfocus.com.example.net".
    host = (urllib.parse.urlparse(url).hostname or "").rstrip(".")
    if host != "forensicfocus.com" and not host.endswith(".forensicfocus.com"):
        return False
    return any(pattern.search(url) for pattern in _FF_NOISE_PATTERNS)
def fetch_transcript(url: str) -> str | None:
    """Return transcript text for *url*, choosing the source by URL type.

    A URL containing a YouTube video id is handled by
    ``fetch_youtube_transcript``. Otherwise, a URL on forensicfocus.com (or
    a subdomain) whose path starts with ``/podcast/`` is handled by
    ``fetch_page_text``. Any other URL yields ``None``.

    Args:
        url: The URL to fetch a transcript for.

    Returns:
        The transcript text, or ``None`` when *url* is empty, is not a
        supported kind of URL, or the chosen fetcher returns nothing.
    """
    if not url:
        return None
    vid = extract_youtube_id(url)
    if vid:
        return fetch_youtube_transcript(vid)
    parsed = urllib.parse.urlparse(url)
    # Compare the parsed hostname, not a substring of netloc: it is
    # lower-cased and cannot be matched by look-alike hosts such as
    # "notforensicfocus.com".
    host = (parsed.hostname or "").rstrip(".")
    on_forensic_focus = host == "forensicfocus.com" or host.endswith(
        ".forensicfocus.com"
    )
    if on_forensic_focus and parsed.path.startswith("/podcast/"):
        return fetch_page_text(url)
    return None
def main() -> int:
    """Command-line entry point: print the transcript for the URL argument.

    Returns:
        ``0`` after printing a transcript to stdout; ``1`` when no URL
        argument was given or no transcript could be obtained (a message
        is written to stderr in both cases).
    """
    args = sys.argv[1:]
    if not args:
        print("usage: fetch_transcript.py <url>", file=sys.stderr)
        return 1
    transcript = fetch_transcript(args[0])
    if not transcript:
        print("No transcript available for this URL.", file=sys.stderr)
        return 1
    print(transcript)
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())