use std::fmt::Write;
use scraper::{Html, Selector};
use crate::dom;
use super::ExtractorResult;
use super::comments::{CommentData, build_comment_tree, build_content_html};
/// Returns `true` when the document appears to be a GitHub issue or
/// pull-request page, judged by URL patterns and/or DOM markers.
#[must_use]
pub fn is_github(html: &Html, url: Option<&str>) -> bool {
    // Must look like github.com at all (meta tags or URL host) ...
    let on_github = has_github_meta(html) || url.is_some_and(|u| u.contains("github.com/"));
    if !on_github {
        return false;
    }
    // ... and specifically like an issue or a PR page.
    let issue_like = url.is_some_and(is_issue_url) || has_issue_markers(html);
    let pr_like = url.is_some_and(is_pr_url) || has_pr_markers(html);
    issue_like || pr_like
}
/// Extracts title/author/content from a GitHub issue or PR page.
///
/// Returns `None` when the page is not recognized as GitHub. When the
/// scraped HTML yields almost no text (under 10 words), falls back to the
/// GitHub REST API via [`try_api_fetch`] before settling for the scraped
/// result.
#[must_use]
pub fn extract_github(
    html: &Html,
    url: Option<&str>,
    include_replies: bool,
) -> Option<ExtractorResult> {
    if !is_github(html, url) {
        return None;
    }
    let looks_like_pr = url.is_some_and(is_pr_url) || has_pr_markers(html);
    let scraped = if looks_like_pr {
        extract_pr(html, url, include_replies)
    } else {
        extract_issue(html, url, include_replies)
    };
    // A near-empty body suggests the page was rendered client-side; the
    // API fallback is only used when it actually succeeds.
    if dom::count_words_html(&scraped.content) < 10 {
        if let Some(from_api) = try_api_fetch(url, include_replies) {
            return Some(from_api);
        }
    }
    Some(scraped)
}
/// Checks for GitHub-specific `<meta>` tags that identify github.com pages.
fn has_github_meta(html: &Html) -> bool {
    const META_SELECTORS: [&str; 3] = [
        "meta[name=\"expected-hostname\"][content=\"github.com\"]",
        "meta[name=\"octolytics-url\"]",
        "meta[name=\"github-keyboard-shortcuts\"]",
    ];
    // Selectors that fail to parse are simply skipped.
    META_SELECTORS
        .into_iter()
        .filter_map(|s| Selector::parse(s).ok())
        .any(|sel| html.select(&sel).next().is_some())
}
/// True when the URL path contains an `/issues/` segment.
fn is_issue_url(url: &str) -> bool {
    url.find("/issues/").is_some()
}
/// True when the URL path contains a `/pull/` segment.
fn is_pr_url(url: &str) -> bool {
    url.find("/pull/").is_some()
}
/// Detects the `data-testid` DOM markers used by GitHub's issue view.
fn has_issue_markers(html: &Html) -> bool {
    [
        "[data-testid=\"issue-metadata-sticky\"]",
        "[data-testid=\"issue-title\"]",
    ]
    .iter()
    .any(|s| match Selector::parse(s) {
        Ok(sel) => html.select(&sel).next().is_some(),
        Err(_) => false,
    })
}
/// Detects CSS-class DOM markers present on GitHub pull-request pages.
fn has_pr_markers(html: &Html) -> bool {
    [
        ".pull-discussion-timeline",
        ".discussion-timeline",
        ".gh-header-title",
    ]
    .iter()
    .any(|s| match Selector::parse(s) {
        Ok(sel) => html.select(&sel).next().is_some(),
        Err(_) => false,
    })
}
/// Pulls `(owner, repo)` out of a github.com URL. Both strings are empty
/// when no URL was supplied or it does not match `github.com/owner/repo`.
fn extract_repo_info(url: Option<&str>) -> (String, String) {
    use std::sync::LazyLock;
    static REPO_RE: LazyLock<regex::Regex> = LazyLock::new(|| {
        regex::Regex::new(r"github\.com/([^/]+)/([^/]+)").expect("github repo regex is valid")
    });
    match url.and_then(|u| REPO_RE.captures(u)) {
        None => (String::new(), String::new()),
        Some(caps) => {
            let grab = |i: usize| {
                caps.get(i)
                    .map_or_else(String::new, |m| m.as_str().to_string())
            };
            (grab(1), grab(2))
        }
    }
}
/// Builds an `ExtractorResult` from a scraped GitHub issue page.
/// Comments are included only when `include_replies` is set.
fn extract_issue(html: &Html, url: Option<&str>, include_replies: bool) -> ExtractorResult {
    let (_owner, _repo) = extract_repo_info(url);
    let title = extract_title(html);
    let (body, author) = extract_issue_body(html);
    let comments = if include_replies {
        extract_issue_comments(html)
    } else {
        String::new()
    };
    ExtractorResult {
        content: build_content_html("github", &body, &comments),
        title: Some(title),
        // An empty author string means "unknown" — map it to None.
        author: (!author.is_empty()).then_some(author),
        site: Some("GitHub".to_string()),
        published: None,
        image: None,
        description: None,
    }
}
/// Reads the `<title>` text and strips the trailing " · something" suffix
/// when that suffix contains a '/' (i.e. looks like an "owner/repo" tail).
fn extract_title(html: &Html) -> String {
    let raw = Selector::parse("title")
        .ok()
        .and_then(|s| html.select(&s).next().map(|el| el.id()))
        .map(|id| dom::text_content(html, id).trim().to_string())
        .unwrap_or_default();
    if let Some(idx) = raw.rfind(" · ") {
        let suffix = &raw[idx + " · ".len()..];
        if suffix.contains('/') {
            return raw[..idx].to_string();
        }
    }
    raw
}
/// Returns `(body_html, author)` for the issue. Both are empty when the
/// issue container element is absent.
fn extract_issue_body(html: &Html) -> (String, String) {
    let containers = dom::select_ids(html, "[data-testid=\"issue-viewer-issue-container\"]");
    match containers.first() {
        None => (String::new(), String::new()),
        Some(&container_id) => {
            let author = extract_issue_author(html, container_id);
            // NOTE: the body is located document-wide, not scoped to the
            // container — this mirrors the markup GitHub emits.
            let body = dom::select_ids(
                html,
                "[data-testid=\"issue-body-viewer\"] [data-testid=\"markdown-body\"]",
            )
            .first()
            .map(|&id| dom::inner_html(html, id))
            .unwrap_or_default();
            (body.trim().to_string(), author)
        }
    }
}
/// Finds the issue author inside the issue container: first from the
/// element's text, then from the link href (stripped to a bare login).
/// Returns "" when neither selector yields a usable value.
fn extract_issue_author(html: &Html, container_id: ego_tree::NodeId) -> String {
    let author_selectors = [
        "[data-testid=\"issue-body-header-author\"]",
        "a[data-testid=\"avatar-link\"]",
    ];
    for sel in author_selectors {
        let Some(&id) = dom::select_within(html, container_id, sel).first() else {
            continue;
        };
        let text = dom::text_content(html, id);
        if !text.trim().is_empty() {
            return text.trim().to_string();
        }
        // No visible text — fall back to the profile link target.
        let Some(href) = dom::get_attr(html, id, "href") else {
            continue;
        };
        let login = href
            .strip_prefix("https://github.com/")
            .or_else(|| href.strip_prefix('/'))
            .unwrap_or(&href);
        if !login.is_empty() {
            return login.to_string();
        }
    }
    String::new()
}
fn extract_issue_comments(html: &Html) -> String {
let timeline_sel = "[data-wrapper-timeline-id]";
let timeline_ids = dom::select_ids(html, timeline_sel);
let mut comments = Vec::new();
for &timeline_id in &timeline_ids {
if let Some(comment) = extract_single_issue_comment(html, timeline_id) {
comments.push(comment);
}
}
if comments.is_empty() {
return String::new();
}
build_comment_tree(&comments)
}
/// Parses one issue timeline entry into a `CommentData`, or `None` when
/// the entry has no non-empty markdown body.
fn extract_single_issue_comment(html: &Html, timeline_id: ego_tree::NodeId) -> Option<CommentData> {
    // Prefer the inner `.react-issue-comment` node; otherwise search the
    // timeline wrapper itself.
    let container = dom::select_within(html, timeline_id, ".react-issue-comment")
        .first()
        .copied()
        .unwrap_or(timeline_id);
    let body = dom::select_within(html, container, "[data-testid=\"markdown-body\"]")
        .first()
        .map(|&id| dom::inner_html(html, id).trim().to_string())
        .filter(|b| !b.is_empty())?;
    Some(CommentData {
        author: extract_comment_author(html, container),
        date: extract_relative_time(html, container),
        content: body,
        depth: 0,
        score: None,
        url: None,
    })
}
/// Derives a comment author's login from an avatar/profile link `href`
/// (leading '/' stripped); falls back to "Unknown".
fn extract_comment_author(html: &Html, container_id: ego_tree::NodeId) -> String {
    let selectors = [
        "[data-testid=\"avatar-link\"]",
        "a[href^=\"/\"][data-hovercard-url]",
    ];
    for sel in selectors {
        let href = dom::select_within(html, container_id, sel)
            .first()
            .and_then(|&id| dom::get_attr(html, id, "href"));
        if let Some(href) = href {
            let login = href.strip_prefix('/').unwrap_or(&href);
            if !login.is_empty() {
                return login.to_string();
            }
        }
    }
    "Unknown".to_string()
}
/// Reads the portion of the first `<relative-time>` element's `datetime`
/// attribute before 'T' (the date part of a timestamp), or "" when absent.
fn extract_relative_time(html: &Html, container_id: ego_tree::NodeId) -> String {
    let Some(&id) = dom::select_within(html, container_id, "relative-time").first() else {
        return String::new();
    };
    dom::get_attr(html, id, "datetime")
        .and_then(|dt| dt.split('T').next().map(str::to_string))
        .unwrap_or_default()
}
/// Builds an `ExtractorResult` from a scraped GitHub pull-request page.
/// Comments are included only when `include_replies` is set.
fn extract_pr(html: &Html, url: Option<&str>, include_replies: bool) -> ExtractorResult {
    let (_owner, _repo) = extract_repo_info(url);
    let title = extract_title(html);
    let (body, author) = extract_pr_body(html);
    let comments = if include_replies {
        extract_pr_comments(html)
    } else {
        String::new()
    };
    ExtractorResult {
        content: build_content_html("github", &body, &comments),
        title: Some(title),
        // An empty author string means "unknown" — map it to None.
        author: (!author.is_empty()).then_some(author),
        site: Some("GitHub".to_string()),
        published: None,
        image: None,
        description: None,
    }
}
/// Returns `(body_html, author)` for a PR. The body is looked up inside
/// the `pullrequest-*` container when one exists, otherwise anywhere in
/// the page; the author is only extracted when the container exists.
fn extract_pr_body(html: &Html) -> (String, String) {
    let pr_container = dom::select_ids(html, "[id^=\"pullrequest-\"]")
        .first()
        .copied();
    let body_sel = ".comment-body.markdown-body";
    let body_ids = match pr_container {
        Some(cid) => dom::select_within(html, cid, body_sel),
        None => dom::select_ids(html, body_sel),
    };
    let body = body_ids
        .first()
        .map(|&id| dom::inner_html(html, id).trim().to_string())
        .unwrap_or_default();
    let author = match pr_container {
        Some(cid) => extract_pr_author(html, cid),
        None => String::new(),
    };
    (body, author)
}
/// Trimmed text of the first `.author` element under `container_id`,
/// or "" when there is none.
fn extract_pr_author(html: &Html, container_id: ego_tree::NodeId) -> String {
    match dom::select_within(html, container_id, ".author").first() {
        Some(&id) => dom::text_content(html, id).trim().to_string(),
        None => String::new(),
    }
}
fn extract_pr_comments(html: &Html) -> String {
let comment_sel = ".timeline-comment, .review-comment";
let all_ids = dom::select_ids(html, comment_sel);
let pr_body_ids = dom::select_ids(html, "[id^=\"pullrequest-\"]");
let pr_body_id = pr_body_ids.first().copied();
let mut comments = Vec::new();
for &cid in &all_ids {
if pr_body_id.is_some_and(|pb| pb == cid || dom::is_ancestor(html, cid, pb)) {
continue;
}
if let Some(comment) = extract_single_pr_comment(html, cid) {
comments.push(comment);
}
}
if comments.is_empty() {
return String::new();
}
build_comment_tree(&comments)
}
/// Parses one PR comment node into a `CommentData`, or `None` when the
/// node has no non-empty `.comment-body.markdown-body` content.
fn extract_single_pr_comment(html: &Html, comment_id: ego_tree::NodeId) -> Option<CommentData> {
    let body = dom::select_within(html, comment_id, ".comment-body.markdown-body")
        .first()
        .map(|&id| dom::inner_html(html, id).trim().to_string())
        .filter(|b| !b.is_empty())?;
    Some(CommentData {
        author: extract_pr_author(html, comment_id),
        date: extract_relative_time(html, comment_id),
        content: body,
        depth: 0,
        score: None,
        url: None,
    })
}
/// Splits a GitHub issue/PR URL into `(owner, repo, number, is_pr)`;
/// `None` when the URL does not match `github.com/owner/repo/{issues|pull}/N`.
fn parse_github_url(url: &str) -> Option<(String, String, String, bool)> {
    use std::sync::LazyLock;
    static RE: LazyLock<regex::Regex> = LazyLock::new(|| {
        regex::Regex::new(r"github\.com/([^/]+)/([^/]+)/(issues|pull)/(\d+)")
            .expect("github url regex is valid")
    });
    let caps = RE.captures(url)?;
    let field = |i: usize| caps.get(i).map(|m| m.as_str().to_string());
    let kind = caps.get(3)?.as_str();
    Some((field(1)?, field(2)?, field(4)?, kind == "pull"))
}
/// Fetches the issue/PR via the GitHub REST API and renders it as an
/// `ExtractorResult`. Returns `None` when the URL is absent or
/// unparseable, or when the API request/JSON parse fails. Comments are
/// fetched in a second request only when `include_replies` is set.
fn try_api_fetch(url: Option<&str>, include_replies: bool) -> Option<ExtractorResult> {
    let (owner, repo, number, is_pr) = parse_github_url(url?)?;
    let kind = if is_pr { "pulls" } else { "issues" };
    let api_url = format!("https://api.github.com/repos/{owner}/{repo}/{kind}/{number}");
    let json = fetch_github_json(&api_url)?;
    let title = json_str(&json, "title");
    let author = json
        .get("user")
        .and_then(|u| u.get("login"))
        .and_then(serde_json::Value::as_str)
        .unwrap_or("")
        .to_string();
    // Keep only the part of "created_at" before 'T' (the date).
    let created = json_str(&json, "created_at");
    let published = created.split('T').next().unwrap_or("").to_string();
    let body_html = markdown_to_html(&json_str(&json, "body"));
    let comments_html = if include_replies {
        fetch_api_comments(&owner, &repo, &number)
    } else {
        String::new()
    };
    Some(ExtractorResult {
        content: build_content_html("github", &body_html, &comments_html),
        title: (!title.is_empty()).then_some(title),
        author: (!author.is_empty()).then_some(author),
        site: Some("GitHub".to_string()),
        published: (!published.is_empty()).then_some(published),
        image: None,
        description: None,
    })
}
fn fetch_api_comments(owner: &str, repo: &str, number: &str) -> String {
let url = format!("https://api.github.com/repos/{owner}/{repo}/issues/{number}/comments");
let Some(json) = fetch_github_json(&url) else {
return String::new();
};
let Some(arr) = json.as_array() else {
return String::new();
};
let comments: Vec<CommentData> = arr
.iter()
.filter_map(|c| {
let body = c.get("body")?.as_str()?;
if body.trim().is_empty() {
return None;
}
let author = c
.get("user")
.and_then(|u| u.get("login"))
.and_then(serde_json::Value::as_str)
.unwrap_or("Unknown")
.to_string();
let date = c
.get("created_at")
.and_then(serde_json::Value::as_str)
.and_then(|d| d.split('T').next())
.unwrap_or("")
.to_string();
Some(CommentData {
author,
date,
content: markdown_to_html(body),
depth: 0,
score: None,
url: None,
})
})
.collect();
if comments.is_empty() {
return String::new();
}
build_comment_tree(&comments)
}
/// GETs `url` with the GitHub JSON `Accept` header and parses the body;
/// `None` when the request or the JSON parse fails.
fn fetch_github_json(url: &str) -> Option<serde_json::Value> {
    let headers = [("Accept", "application/vnd.github+json")];
    let body = crate::http::get_with_headers(url, &headers)?;
    serde_json::from_str(&body).ok()
}
/// String value at `key`, or "" when the key is missing or not a string.
fn json_str(json: &serde_json::Value, key: &str) -> String {
    match json.get(key).and_then(serde_json::Value::as_str) {
        Some(s) => s.to_string(),
        None => String::new(),
    }
}
/// Converts a small subset of Markdown — paragraphs, ATX headers, and
/// fenced code blocks — to escaped HTML. Inline formatting (bold, links,
/// etc.) is left as literal text; all content is HTML-escaped.
fn markdown_to_html(md: &str) -> String {
    let mut out = String::with_capacity(md.len() * 2);
    let mut in_fence = false;
    let mut para: Vec<&str> = Vec::new();
    for line in md.lines() {
        // A line starting with ``` toggles the fenced code block.
        if line.starts_with("```") {
            flush_paragraph(&mut out, &mut para);
            if in_fence {
                out.push_str("</code></pre>\n");
            } else {
                out.push_str("<pre><code>");
            }
            in_fence = !in_fence;
            continue;
        }
        if in_fence {
            // Inside a fence: emit the raw line, escaped, one per line.
            let _ = writeln!(out, "{}", dom::html_escape(line));
            continue;
        }
        let trimmed = line.trim();
        if trimmed.is_empty() {
            // Blank line ends the current paragraph.
            flush_paragraph(&mut out, &mut para);
        } else if let Some(header) = parse_md_header(trimmed) {
            flush_paragraph(&mut out, &mut para);
            out.push_str(&header);
        } else {
            para.push(trimmed);
        }
    }
    if in_fence {
        // Unterminated fence: close the tag so the output stays well-formed.
        out.push_str("</code></pre>\n");
    }
    flush_paragraph(&mut out, &mut para);
    out
}
/// Joins the buffered lines with single spaces into one escaped `<p>`
/// element appended to `html`, then clears the buffer. No-op when the
/// buffer is empty.
fn flush_paragraph(html: &mut String, lines: &mut Vec<&str>) {
    if !lines.is_empty() {
        let joined = lines.join(" ");
        let _ = writeln!(html, "<p>{}</p>", dom::html_escape(&joined));
        lines.clear();
    }
}
/// Renders an ATX-style Markdown header (`# …` through `###### …`) as an
/// escaped `<hN>` element, or returns `None` when `line` is not a header.
///
/// Per CommonMark, the run of `#` characters must be followed by
/// whitespace (or end of line) to count as a heading. Enforcing that here
/// keeps issue/PR references such as "#123" at the start of a line —
/// common in GitHub content — from being misrendered as `<h1>123</h1>`.
fn parse_md_header(line: &str) -> Option<String> {
    let level = line.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&level) {
        return None;
    }
    // Reject "#123"-style text: a real ATX header has whitespace after
    // the hashes (or nothing at all, which is caught as empty below).
    match line.as_bytes().get(level) {
        None | Some(b' ' | b'\t') => {}
        Some(_) => return None,
    }
    let rest = line[level..].trim();
    if rest.is_empty() {
        return None;
    }
    let escaped = dom::html_escape(rest);
    Some(format!("<h{level}>{escaped}</h{level}>\n"))
}
#[cfg(test)]
#[expect(clippy::unwrap_used, clippy::panic)]
mod tests {
    use super::*;

    /// Reads a saved HTML fixture from `tests/fixtures/` under the crate root.
    fn load_fixture(name: &str) -> String {
        let path = format!("{}/tests/fixtures/{name}", env!("CARGO_MANIFEST_DIR"));
        std::fs::read_to_string(&path)
            .unwrap_or_else(|e| panic!("fixture not found at {path}: {e}"))
    }

    /// Recovers the original page URL from the `<!-- {json} -->` comment
    /// embedded in a fixture, when present.
    fn url_from_fixture(html_str: &str) -> Option<String> {
        let start = html_str.find("<!-- {")?;
        let comment_start = start + "<!-- ".len();
        let end = html_str[comment_start..].find(" -->")?;
        let json_str = &html_str[comment_start..comment_start + end];
        let val: serde_json::Value = serde_json::from_str(json_str).ok()?;
        val.get("url").and_then(|v| v.as_str()).map(String::from)
    }

    #[test]
    fn extract_github_issue() {
        let html_str = load_fixture("general--github.com-issue-56.html");
        let url = url_from_fixture(&html_str);
        let html = Html::parse_document(&html_str);
        assert!(is_github(&html, url.as_deref()));
        let result = extract_github(&html, url.as_deref(), true).unwrap();
        assert!(result.title.as_ref().unwrap().contains("Issue #56"));
        assert!(result.site.as_ref().unwrap().contains("GitHub"));
        assert!(result.content.contains("defuddle-cloudflare"));
        assert!(result.content.contains("Comments"));
    }

    #[test]
    fn parse_github_issue_url() {
        let result = parse_github_url("https://github.com/owner/repo/issues/123");
        let (o, r, n, is_pr) = result.unwrap();
        assert_eq!(o, "owner");
        assert_eq!(r, "repo");
        assert_eq!(n, "123");
        assert!(!is_pr);
    }

    #[test]
    fn parse_github_pr_url() {
        let result = parse_github_url("https://github.com/owner/repo/pull/42");
        let (o, r, n, is_pr) = result.unwrap();
        assert_eq!(o, "owner");
        assert_eq!(r, "repo");
        assert_eq!(n, "42");
        assert!(is_pr);
    }

    #[test]
    fn parse_github_url_invalid() {
        assert!(parse_github_url("https://github.com/owner/repo").is_none());
        assert!(parse_github_url("https://example.com").is_none());
    }

    #[test]
    fn markdown_to_html_basic() {
        let md = "Hello **world**\n\nA paragraph.\n\n## Header\n\n```\ncode\n```";
        let html = markdown_to_html(md);
        // Inline markdown is intentionally left as literal text.
        assert!(html.contains("<p>Hello **world**</p>"));
        assert!(html.contains("<p>A paragraph.</p>"));
        assert!(html.contains("<h2>Header</h2>"));
        assert!(html.contains("<pre><code>"));
        assert!(html.contains("code"));
    }

    #[test]
    fn markdown_to_html_escapes_html_in_text() {
        let md = "Use <script>alert('xss')</script> tag\n\n## <b>Bold</b> header";
        let html = markdown_to_html(md);
        // Raw tags must not survive conversion; they are entity-escaped.
        // (Previously these assertions compared against the raw tags,
        // contradicting the `!contains` checks — they could never pass.)
        assert!(!html.contains("<script>"));
        assert!(html.contains("&lt;script&gt;"));
        assert!(!html.contains("<b>Bold</b> header</h2>"));
        assert!(html.contains("&lt;b&gt;Bold&lt;/b&gt;"));
    }

    #[test]
    fn api_fetch_live_issue() {
        // Best-effort live-network test: only asserts when the API call
        // actually succeeds, so offline runs still pass.
        let url = "https://github.com/rust-lang/rust/issues/1";
        let result = try_api_fetch(Some(url), false);
        if let Some(r) = result {
            assert!(r.title.is_some());
            assert!(r.author.is_some());
            assert_eq!(r.site.as_deref(), Some("GitHub"));
        }
    }

    #[test]
    fn extract_github_pr() {
        let html_str = load_fixture("general--github.com-test-owner-test-repo-pull-42.html");
        let url = url_from_fixture(&html_str);
        let html = Html::parse_document(&html_str);
        assert!(is_github(&html, url.as_deref()));
        let result = extract_github(&html, url.as_deref(), true).unwrap();
        assert!(result.title.unwrap().contains("Pull Request #42"));
        assert_eq!(result.author.as_deref(), Some("author-one"));
        assert_eq!(result.site.as_deref(), Some("GitHub"));
        assert!(result.content.contains("Summary"));
        assert!(result.content.contains("regression"));
        assert!(result.content.contains("Comments"));
        assert!(result.content.contains("reviewer-bot"));
    }
}