use std::fmt::Write;
use scraper::{Html, Selector};
use crate::dom;
use super::ExtractorResult;
use super::comments::{CommentData, build_comment_tree, build_content_html};
/// Returns `true` when the document appears to be a GitHub issue or
/// pull-request page, judged by URL patterns and/or DOM markers.
#[must_use]
pub fn is_github(html: &Html, url: Option<&str>) -> bool {
    // Must look like github.com at all (meta tags or URL host) ...
    let on_github = has_github_meta(html) || url.is_some_and(|u| u.contains("github.com/"));
    if !on_github {
        return false;
    }
    // ... and specifically like an issue or a PR page.
    let issue_like = url.is_some_and(is_issue_url) || has_issue_markers(html);
    let pr_like = url.is_some_and(is_pr_url) || has_pr_markers(html);
    issue_like || pr_like
}
/// Extracts title/author/content from a GitHub issue or PR page.
///
/// Returns `None` when the page is not recognized as GitHub. When the
/// scraped HTML yields almost no text (under 10 words), falls back to the
/// GitHub REST API via [`try_api_fetch`] before settling for the scraped
/// result.
#[must_use]
pub fn extract_github(
    html: &Html,
    url: Option<&str>,
    include_replies: bool,
) -> Option<ExtractorResult> {
    if !is_github(html, url) {
        return None;
    }
    let looks_like_pr = url.is_some_and(is_pr_url) || has_pr_markers(html);
    let scraped = if looks_like_pr {
        extract_pr(html, url, include_replies)
    } else {
        extract_issue(html, url, include_replies)
    };
    // A near-empty body suggests the page was rendered client-side; the
    // API fallback is only used when it actually succeeds.
    if dom::count_words_html(&scraped.content) < 10 {
        if let Some(from_api) = try_api_fetch(url, include_replies) {
            return Some(from_api);
        }
    }
    Some(scraped)
}
/// Checks for GitHub-specific `<meta>` tags that identify github.com pages.
fn has_github_meta(html: &Html) -> bool {
    const META_SELECTORS: [&str; 3] = [
        "meta[name=\"expected-hostname\"][content=\"github.com\"]",
        "meta[name=\"octolytics-url\"]",
        "meta[name=\"github-keyboard-shortcuts\"]",
    ];
    // Selectors that fail to parse are simply skipped.
    META_SELECTORS
        .into_iter()
        .filter_map(|s| Selector::parse(s).ok())
        .any(|sel| html.select(&sel).next().is_some())
}
/// True when the URL path contains an `/issues/` segment.
fn is_issue_url(url: &str) -> bool {
    url.find("/issues/").is_some()
}
/// True when the URL path contains a `/pull/` segment.
fn is_pr_url(url: &str) -> bool {
    url.find("/pull/").is_some()
}
/// Detects the `data-testid` DOM markers used by GitHub's issue view.
fn has_issue_markers(html: &Html) -> bool {
    [
        "[data-testid=\"issue-metadata-sticky\"]",
        "[data-testid=\"issue-title\"]",
    ]
    .iter()
    .any(|s| match Selector::parse(s) {
        Ok(sel) => html.select(&sel).next().is_some(),
        Err(_) => false,
    })
}
/// Detects CSS-class DOM markers present on GitHub pull-request pages.
fn has_pr_markers(html: &Html) -> bool {
    [
        ".pull-discussion-timeline",
        ".discussion-timeline",
        ".gh-header-title",
    ]
    .iter()
    .any(|s| match Selector::parse(s) {
        Ok(sel) => html.select(&sel).next().is_some(),
        Err(_) => false,
    })
}
/// Pulls `(owner, repo)` out of a github.com URL. Both strings are empty
/// when no URL was supplied or it does not match `github.com/owner/repo`.
fn extract_repo_info(url: Option<&str>) -> (String, String) {
    use std::sync::LazyLock;
    static REPO_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
        regex::Regex::new(r"github\.com/([^/]+)/([^/]+)").expect("github repo regex is valid")
    });
    match url.and_then(|u| REPO_RE.captures(u)) {
        None => (String::new(), String::new()),
        Some(caps) => {
            let grab = |i: usize| {
                caps.get(i)
                    .map_or_else(String::new, |m| m.as_str().to_string())
            };
            (grab(1), grab(2))
        }
    }
}
/// Builds an `ExtractorResult` from a scraped GitHub issue page.
/// Comments are included only when `include_replies` is set.
fn extract_issue(html: &Html, url: Option<&str>, include_replies: bool) -> ExtractorResult {
    let (_owner, _repo) = extract_repo_info(url);
    let title = extract_title(html);
    let (body, author) = extract_issue_body(html);
    let comments = if include_replies {
        extract_issue_comments(html)
    } else {
        String::new()
    };
    ExtractorResult {
        content: build_content_html("github", &body, &comments),
        title: Some(title),
        // An empty author string means "unknown" — map it to None.
        author: (!author.is_empty()).then_some(author),
        site: Some("GitHub".to_string()),
        published: None,
        image: None,
        description: None,
    }
}
/// Reads the `<title>` text and strips the trailing " · something" suffix
/// when that suffix contains a '/' (i.e. looks like an "owner/repo" tail).
fn extract_title(html: &Html) -> String {
    let raw = Selector::parse("title")
        .ok()
        .and_then(|s| html.select(&s).next().map(|el| el.id()))
        .map(|id| dom::text_content(html, id).trim().to_string())
        .unwrap_or_default();
    if let Some(idx) = raw.rfind(" · ") {
        let suffix = &raw[idx + " · ".len()..];
        if suffix.contains('/') {
            return raw[..idx].to_string();
        }
    }
    raw
}
/// Returns `(body_html, author)` for the issue. Both are empty when the
/// issue container element is absent.
fn extract_issue_body(html: &Html) -> (String, String) {
    let containers = dom::select_ids(html, "[data-testid=\"issue-viewer-issue-container\"]");
    match containers.first() {
        None => (String::new(), String::new()),
        Some(&container_id) => {
            let author = extract_issue_author(html, container_id);
            // NOTE: the body is located document-wide, not scoped to the
            // container — this mirrors the markup GitHub emits.
            let body = dom::select_ids(
                html,
                "[data-testid=\"issue-body-viewer\"] [data-testid=\"markdown-body\"]",
            )
            .first()
            .map(|&id| dom::inner_html(html, id))
            .unwrap_or_default();
            (body.trim().to_string(), author)
        }
    }
}
/// Finds the issue author inside the issue container: first from the
/// element's text, then from the link href (stripped to a bare login).
/// Returns "" when neither selector yields a usable value.
fn extract_issue_author(html: &Html, container_id: ego_tree::NodeId) -> String {
    let author_selectors = [
        "[data-testid=\"issue-body-header-author\"]",
        "a[data-testid=\"avatar-link\"]",
    ];
    for sel in author_selectors {
        let Some(&id) = dom::select_within(html, container_id, sel).first() else {
            continue;
        };
        let text = dom::text_content(html, id);
        if !text.trim().is_empty() {
            return text.trim().to_string();
        }
        // No visible text — fall back to the profile link target.
        let Some(href) = dom::get_attr(html, id, "href") else {
            continue;
        };
        let login = href
            .strip_prefix("https://github.com/")
            .or_else(|| href.strip_prefix('/'))
            .unwrap_or(&href);
        if !login.is_empty() {
            return login.to_string();
        }
    }
    String::new()
}
fn extract_issue_comments(html: &Html) -> String {
let timeline_sel = "[data-wrapper-timeline-id]";
let timeline_ids = dom::select_ids(html, timeline_sel);
let mut comments = Vec::new();
for &timeline_id in &timeline_ids {
if let Some(comment) = extract_single_issue_comment(html, timeline_id) {
comments.push(comment);
}
}
if comments.is_empty() {
return String::new();
}
build_comment_tree(&comments)
}
/// Parses one issue timeline entry into a `CommentData`, or `None` when
/// the entry has no non-empty markdown body.
fn extract_single_issue_comment(html: &Html, timeline_id: ego_tree::NodeId) -> Option<CommentData> {
    // Prefer the inner `.react-issue-comment` node; otherwise search the
    // timeline wrapper itself.
    let container = dom::select_within(html, timeline_id, ".react-issue-comment")
        .first()
        .copied()
        .unwrap_or(timeline_id);
    let body = dom::select_within(html, container, "[data-testid=\"markdown-body\"]")
        .first()
        .map(|&id| dom::inner_html(html, id).trim().to_string())
        .filter(|b| !b.is_empty())?;
    Some(CommentData {
        author: extract_comment_author(html, container),
        date: extract_relative_time(html, container),
        content: body,
        depth: 0,
        score: None,
        url: None,
    })
}
/// Derives a comment author's login from an avatar/profile link `href`
/// (leading '/' stripped); falls back to "Unknown".
fn extract_comment_author(html: &Html, container_id: ego_tree::NodeId) -> String {
    let selectors = [
        "[data-testid=\"avatar-link\"]",
        "a[href^=\"/\"][data-hovercard-url]",
    ];
    for sel in selectors {
        let href = dom::select_within(html, container_id, sel)
            .first()
            .and_then(|&id| dom::get_attr(html, id, "href"));
        if let Some(href) = href {
            let login = href.strip_prefix('/').unwrap_or(&href);
            if !login.is_empty() {
                return login.to_string();
            }
        }
    }
    "Unknown".to_string()
}
/// Reads the portion of the first `<relative-time>` element's `datetime`
/// attribute before 'T' (the date part of a timestamp), or "" when absent.
fn extract_relative_time(html: &Html, container_id: ego_tree::NodeId) -> String {
    let Some(&id) = dom::select_within(html, container_id, "relative-time").first() else {
        return String::new();
    };
    dom::get_attr(html, id, "datetime")
        .and_then(|dt| dt.split('T').next().map(str::to_string))
        .unwrap_or_default()
}
/// Builds an `ExtractorResult` from a scraped GitHub pull-request page.
/// Comments are included only when `include_replies` is set.
fn extract_pr(html: &Html, url: Option<&str>, include_replies: bool) -> ExtractorResult {
    let (_owner, _repo) = extract_repo_info(url);
    let title = extract_title(html);
    let (body, author) = extract_pr_body(html);
    let comments = if include_replies {
        extract_pr_comments(html)
    } else {
        String::new()
    };
    ExtractorResult {
        content: build_content_html("github", &body, &comments),
        title: Some(title),
        // An empty author string means "unknown" — map it to None.
        author: (!author.is_empty()).then_some(author),
        site: Some("GitHub".to_string()),
        published: None,
        image: None,
        description: None,
    }
}
/// Returns `(body_html, author)` for a PR. The body is looked up inside
/// the `pullrequest-*` container when one exists, otherwise anywhere in
/// the page; the author is only extracted when the container exists.
fn extract_pr_body(html: &Html) -> (String, String) {
    let pr_container = dom::select_ids(html, "[id^=\"pullrequest-\"]")
        .first()
        .copied();
    let body_sel = ".comment-body.markdown-body";
    let body_ids = match pr_container {
        Some(cid) => dom::select_within(html, cid, body_sel),
        None => dom::select_ids(html, body_sel),
    };
    let body = body_ids
        .first()
        .map(|&id| dom::inner_html(html, id).trim().to_string())
        .unwrap_or_default();
    let author = match pr_container {
        Some(cid) => extract_pr_author(html, cid),
        None => String::new(),
    };
    (body, author)
}
/// Trimmed text of the first `.author` element under `container_id`,
/// or "" when there is none.
fn extract_pr_author(html: &Html, container_id: ego_tree::NodeId) -> String {
    match dom::select_within(html, container_id, ".author").first() {
        Some(&id) => dom::text_content(html, id).trim().to_string(),
        None => String::new(),
    }
}
fn extract_pr_comments(html: &Html) -> String {
let comment_sel = ".timeline-comment, .review-comment";
let all_ids = dom::select_ids(html, comment_sel);
let pr_body_ids = dom::select_ids(html, "[id^=\"pullrequest-\"]");
let pr_body_id = pr_body_ids.first().copied();
let mut comments = Vec::new();
for &cid in &all_ids {
if pr_body_id.is_some_and(|pb| pb == cid || dom::is_ancestor(html, cid, pb)) {
continue;
}
if let Some(comment) = extract_single_pr_comment(html, cid) {
comments.push(comment);
}
}
if comments.is_empty() {
return String::new();
}
build_comment_tree(&comments)
}
/// Parses one PR comment node into a `CommentData`, or `None` when the
/// node has no non-empty `.comment-body.markdown-body` content.
fn extract_single_pr_comment(html: &Html, comment_id: ego_tree::NodeId) -> Option<CommentData> {
    let body = dom::select_within(html, comment_id, ".comment-body.markdown-body")
        .first()
        .map(|&id| dom::inner_html(html, id).trim().to_string())
        .filter(|b| !b.is_empty())?;
    Some(CommentData {
        author: extract_pr_author(html, comment_id),
        date: extract_relative_time(html, comment_id),
        content: body,
        depth: 0,
        score: None,
        url: None,
    })
}
/// Splits a GitHub issue/PR URL into `(owner, repo, number, is_pr)`;
/// `None` when the URL does not match `github.com/owner/repo/{issues|pull}/N`.
fn parse_github_url(url: &str) -> Option<(String, String, String, bool)> {
    use std::sync::LazyLock;
    static RE: LazyLock<regex::Regex> = LazyLock::new(|| {
        regex::Regex::new(r"github\.com/([^/]+)/([^/]+)/(issues|pull)/(\d+)")
            .expect("github url regex is valid")
    });
    let caps = RE.captures(url)?;
    let field = |i: usize| caps.get(i).map(|m| m.as_str().to_string());
    let kind = caps.get(3)?.as_str();
    Some((field(1)?, field(2)?, field(4)?, kind == "pull"))
}
/// Fetches the issue/PR via the GitHub REST API and renders it as an
/// `ExtractorResult`. Returns `None` when the URL is absent or
/// unparseable, or when the API request/JSON parse fails. Comments are
/// fetched in a second request only when `include_replies` is set.
fn try_api_fetch(url: Option<&str>, include_replies: bool) -> Option<ExtractorResult> {
    let (owner, repo, number, is_pr) = parse_github_url(url?)?;
    let kind = if is_pr { "pulls" } else { "issues" };
    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/{kind}/{number}");
    let json = fetch_github_json(&api_url)?;
    let title = json_str(&json, "title");
    let author = json
        .get("user")
        .and_then(|u| u.get("login"))
        .and_then(serde_json::Value::as_str)
        .unwrap_or("")
        .to_string();
    // Keep only the part of "created_at" before 'T' (the date).
    let created = json_str(&json, "created_at");
    let published = created.split('T').next().unwrap_or("").to_string();
    let body_html = markdown_to_html(&json_str(&json, "body"));
    let comments_html = if include_replies {
        fetch_api_comments(&owner, &repo, &number)
    } else {
        String::new()
    };
    Some(ExtractorResult {
        content: build_content_html("github", &body_html, &comments_html),
        title: (!title.is_empty()).then_some(title),
        author: (!author.is_empty()).then_some(author),
        site: Some("GitHub".to_string()),
        published: (!published.is_empty()).then_some(published),
        image: None,
        description: None,
    })
}
fn fetch_api_comments(owner: &str, repo: &str, number: &str) -> String {
let url = format!("https://api.github.com/repos/{owner}/{repo}/issues/{number}/comments");
let Some(json) = fetch_github_json(&url) else {
return String::new();
};
let Some(arr) = json.as_array() else {
return String::new();
};
let comments: Vec<CommentData> = arr
.iter()
.filter_map(|c| {
let body = c.get("body")?.as_str()?;
if body.trim().is_empty() {
return None;
}
let author = c
.get("user")
.and_then(|u| u.get("login"))
.and_then(serde_json::Value::as_str)
.unwrap_or("Unknown")
.to_string();
let date = c
.get("created_at")
.and_then(serde_json::Value::as_str)
.and_then(|d| d.split('T').next())
.unwrap_or("")
.to_string();
Some(CommentData {
author,
date,
content: markdown_to_html(body),
depth: 0,
score: None,
url: None,
})
})
.collect();
if comments.is_empty() {
return String::new();
}
build_comment_tree(&comments)
}
/// GETs `url` with the GitHub JSON `Accept` header and parses the body;
/// `None` when the request or the JSON parse fails.
fn fetch_github_json(url: &str) -> Option<serde_json::Value> {
    let headers = [("Accept", "application/vnd.github+json")];
    let body = crate::http::get_with_headers(url, &headers)?;
    serde_json::from_str(&body).ok()
}
/// String value at `key`, or "" when the key is missing or not a string.
fn json_str(json: &serde_json::Value, key: &str) -> String {
    match json.get(key).and_then(serde_json::Value::as_str) {
        Some(s) => s.to_string(),
        None => String::new(),
    }
}
/// Converts a small subset of Markdown — paragraphs, ATX headers, and
/// fenced code blocks — to escaped HTML. Inline formatting (bold, links,
/// etc.) is left as literal text; all content is HTML-escaped.
fn markdown_to_html(md: &str) -> String {
    let mut out = String::with_capacity(md.len() * 2);
    let mut in_fence = false;
    let mut para: Vec<&str> = Vec::new();
    for line in md.lines() {
        // A line starting with ``` toggles the fenced code block.
        if line.starts_with("```") {
            flush_paragraph(&mut out, &mut para);
            if in_fence {
                out.push_str("</code></pre>\n");
            } else {
                out.push_str("<pre><code>");
            }
            in_fence = !in_fence;
            continue;
        }
        if in_fence {
            // Inside a fence: emit the raw line, escaped, one per line.
            let _ = writeln!(out, "{}", dom::html_escape(line));
            continue;
        }
        let trimmed = line.trim();
        if trimmed.is_empty() {
            // Blank line ends the current paragraph.
            flush_paragraph(&mut out, &mut para);
        } else if let Some(header) = parse_md_header(trimmed) {
            flush_paragraph(&mut out, &mut para);
            out.push_str(&header);
        } else {
            para.push(trimmed);
        }
    }
    if in_fence {
        // Unterminated fence: close the tag so the output stays well-formed.
        out.push_str("</code></pre>\n");
    }
    flush_paragraph(&mut out, &mut para);
    out
}
/// Joins the buffered lines with single spaces into one escaped `<p>`
/// element appended to `html`, then clears the buffer. No-op when the
/// buffer is empty.
fn flush_paragraph(html: &mut String, lines: &mut Vec<&str>) {
    if !lines.is_empty() {
        let joined = lines.join(" ");
        let _ = writeln!(html, "<p>{}</p>", dom::html_escape(&joined));
        lines.clear();
    }
}
/// Renders an ATX-style Markdown header (`# …` through `###### …`) as an
/// escaped `<hN>` element, or returns `None` when `line` is not a header.
///
/// Per CommonMark, the run of `#` characters must be followed by
/// whitespace (or end of line) to count as a heading. Enforcing that here
/// keeps issue/PR references such as "#123" at the start of a line —
/// common in GitHub content — from being misrendered as `<h1>123</h1>`.
fn parse_md_header(line: &str) -> Option<String> {
    let level = line.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&level) {
        return None;
    }
    // Reject "#123"-style text: a real ATX header has whitespace after
    // the hashes (or nothing at all, which is caught as empty below).
    match line.as_bytes().get(level) {
        None | Some(b' ' | b'\t') => {}
        Some(_) => return None,
    }
    let rest = line[level..].trim();
    if rest.is_empty() {
        return None;
    }
    let escaped = dom::html_escape(rest);
    Some(format!("<h{level}>{escaped}</h{level}>\n"))
}
#[cfg(test)]
#[expect(clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    /// Reads a saved HTML fixture from `tests/fixtures/` under the crate root.
    fn load_fixture(name: &str) -> String {
        let path = format!("{}/tests/fixtures/{name}", env!("CARGO_MANIFEST_DIR"));
        std::fs::read_to_string(&path)
            .unwrap_or_else(|e| panic!("fixture not found at {path}: {e}"))
    }

    /// Recovers the original page URL from the `<!-- {json} -->` comment
    /// embedded in a fixture, when present.
    fn url_from_fixture(html_str: &str) -> Option<String> {
        let start = html_str.find("<!-- {")?;
        let comment_start = start + "<!-- ".len();
        let end = html_str[comment_start..].find(" -->")?;
        let json_str = &html_str[comment_start..comment_start + end];
        let val: serde_json::Value = serde_json::from_str(json_str).ok()?;
        val.get("url").and_then(|v| v.as_str()).map(String::from)
    }

    #[test]
    fn extract_github_issue() {
        let html_str = load_fixture("general--github.com-issue-56.html");
        let url = url_from_fixture(&html_str);
        let html = Html::parse_document(&html_str);
        assert!(is_github(&html, url.as_deref()));
        let result = extract_github(&html, url.as_deref(), true).unwrap();
        assert!(result.title.as_ref().unwrap().contains("Issue #56"));
        assert!(result.site.as_ref().unwrap().contains("GitHub"));
        assert!(result.content.contains("defuddle-cloudflare"));
        assert!(result.content.contains("Comments"));
    }

    #[test]
    fn parse_github_issue_url() {
        let result = parse_github_url("https://github.com/owner/repo/issues/123");
        let (o, r, n, is_pr) = result.unwrap();
        assert_eq!(o, "owner");
        assert_eq!(r, "repo");
        assert_eq!(n, "123");
        assert!(!is_pr);
    }

    #[test]
    fn parse_github_pr_url() {
        let result = parse_github_url("https://github.com/owner/repo/pull/42");
        let (o, r, n, is_pr) = result.unwrap();
        assert_eq!(o, "owner");
        assert_eq!(r, "repo");
        assert_eq!(n, "42");
        assert!(is_pr);
    }

    #[test]
    fn parse_github_url_invalid() {
        assert!(parse_github_url("https://github.com/owner/repo").is_none());
        assert!(parse_github_url("https://example.com").is_none());
    }

    #[test]
    fn markdown_to_html_basic() {
        let md = "Hello **world**\n\nA paragraph.\n\n## Header\n\n```\ncode\n```";
        let html = markdown_to_html(md);
        // Inline markdown is intentionally left as literal text.
        assert!(html.contains("<p>Hello **world**</p>"));
        assert!(html.contains("<p>A paragraph.</p>"));
        assert!(html.contains("<h2>Header</h2>"));
        assert!(html.contains("<pre><code>"));
        assert!(html.contains("code"));
    }

    #[test]
    fn markdown_to_html_escapes_html_in_text() {
        let md = "Use <script>alert('xss')</script> tag\n\n## <b>Bold</b> header";
        let html = markdown_to_html(md);
        // Raw tags must not survive conversion; they are entity-escaped.
        // (Previously these assertions compared against the raw tags,
        // contradicting the `!contains` checks — they could never pass.)
        assert!(!html.contains("<script>"));
        assert!(html.contains("&lt;script&gt;"));
        assert!(!html.contains("<b>Bold</b> header</h2>"));
        assert!(html.contains("&lt;b&gt;Bold&lt;/b&gt;"));
    }

    #[test]
    fn api_fetch_live_issue() {
        // Best-effort live-network test: only asserts when the API call
        // actually succeeds, so offline runs still pass.
        let url = "https://github.com/rust-lang/rust/issues/1";
        let result = try_api_fetch(Some(url), false);
        if let Some(r) = result {
            assert!(r.title.is_some());
            assert!(r.author.is_some());
            assert_eq!(r.site.as_deref(), Some("GitHub"));
        }
    }

    #[test]
    fn extract_github_pr() {
        let html_str = load_fixture("general--github.com-test-owner-test-repo-pull-42.html");
        let url = url_from_fixture(&html_str);
        let html = Html::parse_document(&html_str);
        assert!(is_github(&html, url.as_deref()));
        let result = extract_github(&html, url.as_deref(), true).unwrap();
        assert!(result.title.unwrap().contains("Pull Request #42"));
        assert_eq!(result.author.as_deref(), Some("author-one"));
        assert_eq!(result.site.as_deref(), Some("GitHub"));
        assert!(result.content.contains("Summary"));
        assert!(result.content.contains("regression"));
        assert!(result.content.contains("Comments"));
        assert!(result.content.contains("reviewer-bot"));
    }
}