import os
import re
import time
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md
# Host of the Wave developer help center; all discovered paths are relative to it.
BASE_URL = "https://developer.waveapps.com"
# Category page grouping the developer-documentation sections.
CATEGORY_URL = f"{BASE_URL}/hc/en-us/categories/360001114072"
# Scraped markdown is written next to this script, under ./documentation.
OUTPUT_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "documentation")
# Politeness delay (seconds) between consecutive HTTP requests.
DELAY = 1.0
# Browser-like request headers so the help center serves normal HTML.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/120.0.0.0 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
}
# Section paths (relative to BASE_URL) whose article lists get scraped.
SECTION_URLS = [
    "/hc/en-us/sections/360002680071-Get-Started",
    "/hc/en-us/sections/360003012132-Create-an-App",
    "/hc/en-us/sections/360006441372-Examples",
    "/hc/en-us/sections/360003025751-Schema",
]
def fetch(url):
    """GET *url* with the browser-like HEADERS and a 30s timeout.

    Raises requests.HTTPError for any non-2xx status; returns the Response.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    return response
def slugify(title):
    """Turn *title* into a filesystem-friendly slug.

    Lower-cases the text, collapses every run of non-alphanumeric
    characters into a single '-', and trims dashes from both ends.
    """
    return re.sub(r"[^a-z0-9]+", "-", title.strip().lower()).strip("-")
def discover_articles():
    """Visit each configured section page and collect its article links.

    Returns a dict mapping section title -> list of {"title", "url"}
    dicts, deduplicated by href and with relative hrefs made absolute.
    """
    sections = {}
    for path in SECTION_URLS:
        section_url = BASE_URL + path
        print(f"Fetching section: {section_url}")
        page = BeautifulSoup(fetch(section_url).text, "html.parser")

        # Prefer the page's own <h1>; fall back to a title derived from
        # the URL slug (drop the leading numeric ID, dashes -> spaces).
        slug = path.rsplit("/", 1)[-1]
        derived_title = re.sub(r"^\d+-", "", slug).replace("-", " ")
        heading = page.select_one("h1")
        section_title = derived_title if heading is None else heading.get_text(strip=True)

        found = []
        visited = set()
        for link in page.select('a[href*="/hc/en-us/articles/"]'):
            href = link["href"]
            label = link.get_text(strip=True)
            if not label or href in visited:
                continue
            visited.add(href)
            if not href.startswith("http"):
                href = BASE_URL + href
            found.append({"title": label, "url": href})

        sections[section_title] = found
        print(f" Found {len(found)} articles")
        time.sleep(DELAY)
    return sections
def scrape_article(url):
    """Fetch one article page and convert its body to markdown.

    Returns (title, content). Title falls back to "Untitled" when no
    <h1> is present; content is a placeholder string when no
    recognizable body container exists on the page.
    """
    soup = BeautifulSoup(fetch(url).text, "html.parser")

    heading = soup.select_one("h1.article-title, h1")
    title = "Untitled" if heading is None else heading.get_text(strip=True)

    # Try the Zendesk-style article body first, then generic containers.
    body = soup.select_one("div.article-body, article .article-body, .article-content")
    if body is None:
        body = soup.select_one("article, main, .content")

    if body is None:
        return title, "*No content found*"

    # Drop navigation/footer chrome before the markdown conversion.
    for junk in body.select("nav, footer, .article-footer, .article-sidebar"):
        junk.decompose()
    content = md(str(body), heading_style="ATX", code_language="graphql")
    # Collapse runs of 3+ newlines left behind by removed elements.
    content = re.sub(r"\n{3,}", "\n\n", content)
    return title, content
def save_article(title, content, filename):
    """Write one article to OUTPUT_DIR/filename as '# title' + content.

    Returns the absolute path of the file written.
    """
    destination = os.path.join(OUTPUT_DIR, filename)
    with open(destination, "w", encoding="utf-8") as out:
        out.write(f"# {title}\n\n")
        out.write(content.strip())
        out.write("\n")
    return destination
def main():
    """Scrape every configured section, save each article, write index.md.

    Per-article failures are logged and skipped so one broken page does
    not abort the whole run.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print("Discovering articles...")
    sections = discover_articles()

    # BUG FIX: the assignment and the for-statement were fused onto one
    # line ("all_articles = [] for ..."), which is a syntax error.
    all_articles = []
    for section_title, articles in sections.items():
        print(f"\n--- {section_title} ---")
        for article in articles:
            print(f" Scraping: {article['title']}")
            try:
                title, content = scrape_article(article["url"])
                filename = slugify(title) + ".md"
                save_article(title, content, filename)
                all_articles.append((section_title, title, filename))
                # BUG FIX: this message printed a literal "(unknown)"
                # placeholder instead of the saved filename.
                print(f" -> Saved: {filename}")
            except Exception as e:
                # Best-effort: report and continue with the next article.
                print(f" ERROR: {e}")
            time.sleep(DELAY)

    print("\nGenerating index.md...")
    index_lines = ["# Wave Developer Documentation\n"]
    index_lines.append("Scraped from [Wave Developer Portal]"
                       "(https://developer.waveapps.com/hc/en-us/categories/360001114072)\n")
    current_section = None
    for section, title, filename in all_articles:
        if section != current_section:
            index_lines.append(f"\n## {section}\n")
            current_section = section
        # BUG FIX: the link target was a literal "(unknown)" placeholder;
        # point it at the markdown file saved for this article.
        index_lines.append(f"- [{title}]({filename})")
    index_path = os.path.join(OUTPUT_DIR, "index.md")
    with open(index_path, "w", encoding="utf-8") as f:
        f.write("\n".join(index_lines) + "\n")
    print(f"\nDone! Saved {len(all_articles)} articles to {OUTPUT_DIR}/")
    print(f"Index: {index_path}")
# Run the scraper only when executed directly, not when imported.
if __name__ == "__main__":
    main()