import os
import re
import time
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
# Host of the Wave developer help center; all discovered paths are relative to it.
BASE_URL = "https://developer.waveapps.com"
# Category page grouping the developer-documentation sections.
CATEGORY_URL = f"{BASE_URL}/hc/en-us/categories/360001114072"
# Scraped markdown is written next to this script, under ./documentation.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "documentation")
# Politeness delay (seconds) between consecutive HTTP requests.
DELAY = 1.0
# Browser-like request headers so the help center serves normal HTML.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}
# Section paths (relative to BASE_URL) whose article lists get scraped.
SECTION_URLS = [
    "/hc/en-us/sections/360002680071-Get-Started",
    "/hc/en-us/sections/360003012132-Create-an-App",
    "/hc/en-us/sections/360006441372-Examples",
    "/hc/en-us/sections/360003025751-Schema",
]
def fetch(url):
    """GET *url* with the browser-like HEADERS and a 30s timeout.

    Raises requests.HTTPError for any non-2xx status; returns the Response.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response
def slugify(title):
    """Turn *title* into a filesystem-friendly slug.

    Lower-cases the text, collapses every run of non-alphanumeric
    characters into a single '-', and trims dashes from both ends.
    """
    return re.sub(r"[^a-z0-9]+", "-", title.strip().lower()).strip("-")
def discover_articles():
    """Visit each configured section page and collect its article links.

    Returns a dict mapping section title -> list of {"title", "url"}
    dicts, deduplicated by href and with relative hrefs made absolute.
    """
    sections = {}
    for path in SECTION_URLS:
        section_url = BASE_URL + path
        print(f"Fetching section: {section_url}")
        page = BeautifulSoup(fetch(section_url).text, "html.parser")

        # Prefer the page's own <h1>; fall back to a title derived from
        # the URL slug (drop the leading numeric ID, dashes -> spaces).
        slug = path.rsplit("/", 1)[-1]
        derived_title = re.sub(r"^\d+-", "", slug).replace("-", " ")
        heading = page.select_one("h1")
        section_title = derived_title if heading is None else heading.get_text(strip=True)

        found = []
        visited = set()
        for link in page.select('a[href*="/hc/en-us/articles/"]'):
            href = link["href"]
            label = link.get_text(strip=True)
            if not label or href in visited:
                continue
            visited.add(href)
            if not href.startswith("http"):
                href = BASE_URL + href
            found.append({"title": label, "url": href})

        sections[section_title] = found
        print(f" Found {len(found)} articles")
        time.sleep(DELAY)
    return sections
def scrape_article(url):
    """Fetch one article page and convert its body to markdown.

    Returns (title, content). Title falls back to "Untitled" when no
    <h1> is present; content is a placeholder string when no
    recognizable body container exists on the page.
    """
    soup = BeautifulSoup(fetch(url).text, "html.parser")

    heading = soup.select_one("h1.article-title, h1")
    title = "Untitled" if heading is None else heading.get_text(strip=True)

    # Try the Zendesk-style article body first, then generic containers.
    body = soup.select_one("div.article-body, article .article-body, .article-content")
    if body is None:
        body = soup.select_one("article, main, .content")

    if body is None:
        return title, "*No content found*"

    # Drop navigation/footer chrome before the markdown conversion.
    for junk in body.select("nav, footer, .article-footer, .article-sidebar"):
        junk.decompose()
    content = md(str(body), heading_style="ATX", code_language="graphql")
    # Collapse runs of 3+ newlines left behind by removed elements.
    content = re.sub(r"\n{3,}", "\n\n", content)
    return title, content
def save_article(title, content, filename):
    """Write one article to OUTPUT_DIR/filename as '# title' + content.

    Returns the absolute path of the file written.
    """
    destination = os.path.join(OUTPUT_DIR, filename)
    with open(destination, "w", encoding="utf-8") as out:
        out.write(f"# {title}\n\n")
        out.write(content.strip())
        out.write("\n")
    return destination
def main():
    """Scrape every configured section, save each article, write index.md.

    Per-article failures are logged and skipped so one broken page does
    not abort the whole run.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("Discovering articles...")
    sections = discover_articles()

    # BUG FIX: the assignment and the for-statement were fused onto one
    # line ("all_articles = [] for ..."), which is a syntax error.
    all_articles = []
    for section_title, articles in sections.items():
        print(f"\n--- {section_title} ---")
        for article in articles:
            print(f" Scraping: {article['title']}")
            try:
                title, content = scrape_article(article["url"])
                filename = slugify(title) + ".md"
                save_article(title, content, filename)
                all_articles.append((section_title, title, filename))
                # BUG FIX: this message printed a literal "(unknown)"
                # placeholder instead of the saved filename.
                print(f" -> Saved: {filename}")
            except Exception as e:
                # Best-effort: report and continue with the next article.
                print(f" ERROR: {e}")
            time.sleep(DELAY)

    print("\nGenerating index.md...")
    index_lines = ["# Wave Developer Documentation\n"]
    index_lines.append("Scraped from [Wave Developer Portal]"
                       "(https://developer.waveapps.com/hc/en-us/categories/360001114072)\n")
    current_section = None
    for section, title, filename in all_articles:
        if section != current_section:
            index_lines.append(f"\n## {section}\n")
            current_section = section
        # BUG FIX: the link target was a literal "(unknown)" placeholder;
        # point it at the markdown file saved for this article.
        index_lines.append(f"- [{title}]({filename})")
    index_path = os.path.join(OUTPUT_DIR, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write("\n".join(index_lines) + "\n")
    print(f"\nDone! Saved {len(all_articles)} articles to {OUTPUT_DIR}/")
    print(f"Index: {index_path}")
# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()