use crate::browser::BrowserPool;
use anyhow::Result;
use clap::Args;
use futures::future::join_all;
use scraper::{Html, Selector};
use serde::Serialize;
use std::collections::HashSet;
use std::sync::Arc;
use url::Url;
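/// Arguments for the `fetch` subcommand.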
#[derive(Args)]
pub struct FetchArgs {
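    /// One or more URLs to fetch.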
#[arg(required = true)]
pub urls: Vec<String>,
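    /// Maximum number of pages fetched concurrently.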
#[arg(long, short, default_value = "4")]
pub parallel: usize,
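    /// Per-page navigation timeout in milliseconds.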
#[arg(long, default_value = "30000")]
pub timeout: u64,
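    /// Parse the full page HTML instead of the heuristically extracted main content.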
#[arg(long)]
pub raw: bool,
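    /// Cookie string for the browser session (not currently consumed by `run_fetch`).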
#[arg(long)]
pub cookies: Option<String>,
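    /// CSS selector narrowing extraction to the first matching element.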
#[arg(long)]
pub selector: Option<String>,
}
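/// Coarse classification of what came back for a URL.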
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize)]
#[serde(rename_all = "lowercase")]
pub enum PageStatus {
Ok,
Paywall,
Login,
Dead,
Redirect,
}
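/// A heading and the concatenated paragraph text beneath it.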
#[derive(Debug, Serialize, Clone)]
pub struct Section {
pub level: u8,
pub heading: String,
pub content: String,
}
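/// A deduplicated hyperlink, resolved against the page URL where possible.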
#[derive(Debug, Serialize, Clone)]
pub struct Link {
pub text: String,
pub url: String,
}
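/// A code listing from a `<pre>`/`<code>` element, with the language inferred
/// from `language-*`/`lang-*` class names when present.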
#[derive(Debug, Serialize, Clone)]
pub struct CodeBlock {
#[serde(skip_serializing_if = "Option::is_none")]
pub lang: Option<String>,
pub source: String,
}
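/// Structured extraction result for one URL; serialized as one JSON object.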
#[derive(Debug, Serialize, Clone)]
pub struct Page {
pub url: String,
pub status: PageStatus,
#[serde(skip_serializing_if = "Option::is_none")]
pub title: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub site: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub author: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub date: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub doi: Option<String>,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub sections: Vec<Section>,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub links: Vec<Link>,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub code: Vec<CodeBlock>,
#[serde(skip_serializing_if = "Vec::is_empty")]
pub alerts: Vec<String>,
pub chars: usize,
}
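/// Fetches every URL through a shared browser pool and prints one JSON object
/// per page to stdout (JSONL). Progress and the final tally go to stderr.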
pub async fn run_fetch(args: FetchArgs) -> Result<()> {
let url_count = args.urls.len();
let parallel = args.parallel.min(url_count).max(1);
eprintln!(
"Fetching {} URL{} ({} parallel)...",
url_count,
if url_count == 1 { "" } else { "s" },
parallel
);
let pool = Arc::new(BrowserPool::new(parallel).await?);
let timeout = args.timeout;
let raw = args.raw;
let selector: Option<Arc<str>> = args.selector.map(|s| Arc::from(s.as_str()));
let tasks: Vec<_> = args
.urls
.into_iter()
.map(|url| {
let pool = Arc::clone(&pool);
let sel = selector.clone();
tokio::spawn(async move { fetch_one(&pool, &url, timeout, raw, sel.as_deref()).await })
})
.collect();
let results: Vec<Page> = join_all(tasks)
.await
.into_iter()
.filter_map(std::result::Result::ok)
.collect();
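    // All spawned tasks have completed, so this should be the last strong reference.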
if let Ok(pool) = Arc::try_unwrap(pool) {
pool.close().await?;
}
let ok_count = results
.iter()
.filter(|p| p.status == PageStatus::Ok)
.count();
    // One JSON object per line (JSONL), for one page or many.
    for page in &results {
        println!("{}", serde_json::to_string(page)?);
    }
eprintln!("Done: {ok_count}/{url_count} OK");
Ok(())
}
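/// Drives a single URL through a pooled browser page. Infallible by design:
/// every failure is folded into a `Dead` (or `Redirect`) `Page`.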
pub(crate) async fn fetch_one(
pool: &BrowserPool,
url: &str,
timeout: u64,
raw: bool,
selector: Option<&str>,
) -> Page {
eprintln!(" -> {}", truncate(url, 60));
let page = match pool.new_page().await {
Ok(p) => p,
Err(e) => return error_page(url, &e.to_string()),
};
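    // Remember the original host so cross-domain redirects can be flagged later.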
let original_host = Url::parse(url)
.ok()
.and_then(|u| u.host_str().map(String::from));
let nav = match page.goto(url, timeout).await {
Ok(n) => n,
Err(e) => return error_page(url, &e.to_string()),
};
    if let Some(err) = nav.error {
        return alert_page(url, PageStatus::Dead, nav.title, err);
    }
let final_url = page.current_url().await;
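    // A change of host (ignoring any `www.` prefix) is reported as a redirect
    // rather than silently parsing the landing page.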
if let (Some(orig), Some(final_u)) = (&original_host, &final_url) {
if let Ok(parsed) = Url::parse(final_u) {
if let Some(final_host) = parsed.host_str() {
let orig_norm = orig.trim_start_matches("www.");
let final_norm = final_host.trim_start_matches("www.");
if orig_norm != final_norm {
                    return alert_page(
                        url,
                        PageStatus::Redirect,
                        nav.title,
                        format!("Redirected to: {final_u}"),
                    );
}
}
}
}
let html = match page.content().await {
Ok(h) => h,
Err(e) => return error_page(url, &e.to_string()),
};
parse_page(&html, url, raw, selector)
}
/// Builds a `Page` whose only payload is a status, an optional title, and a
/// single alert message.
fn alert_page(url: &str, status: PageStatus, title: Option<String>, alert: String) -> Page {
    Page {
        url: url.to_string(),
        status,
        title,
        site: None,
        author: None,
        date: None,
        doi: None,
        sections: vec![],
        links: vec![],
        code: vec![],
        alerts: vec![alert],
        chars: 0,
    }
}

/// Shorthand for the common case: a `Dead` page carrying a browser or
/// navigation error.
fn error_page(url: &str, error: &str) -> Page {
    alert_page(url, PageStatus::Dead, None, error.to_string())
}
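/// Parses fetched HTML into a structured `Page`. An explicit `selector` takes
/// precedence over `raw`; both fall back to heuristic main-content extraction.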
pub(crate) fn parse_page(html: &str, url: &str, raw: bool, selector: Option<&str>) -> Page {
let doc = Html::parse_document(html);
let mut alerts = Vec::new();
let title = extract_title(&doc);
let site = extract_meta(&doc, "og:site_name");
let author = extract_meta(&doc, "author").or_else(|| extract_meta(&doc, "article:author"));
let date = extract_meta(&doc, "article:published_time")
.or_else(|| extract_meta(&doc, "date"))
.or_else(|| extract_meta(&doc, "pubdate"));
let doi = extract_doi(&doc);
let status = detect_status(html);
if status == PageStatus::Paywall {
alerts.push("Paywall detected".to_string());
} else if status == PageStatus::Login {
alerts.push("Login required".to_string());
}
    let content_html = if let Some(sel_str) = selector {
        if let Ok(sel) = Selector::parse(sel_str) {
            if let Some(el) = doc.select(&sel).next() {
                el.html()
            } else {
                // Selector matched nothing: fall back to heuristic extraction.
                extract_main_content(&doc)
            }
        } else {
            // Selector failed to parse: fall back to heuristic extraction.
            extract_main_content(&doc)
        }
    } else if raw {
        html.to_string()
    } else {
        extract_main_content(&doc)
    };
let content_doc = Html::parse_document(&content_html);
let sections = extract_sections(&content_doc);
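    // Raw mode scans the whole document for links; otherwise only the
    // extracted content region is scanned.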
let links = if raw {
extract_all_links(&doc, url)
} else {
extract_content_links(&content_doc, url)
};
let code = extract_code_blocks(&content_doc);
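    // `chars` counts section text only; links and code blocks are excluded.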
let chars: usize = sections
.iter()
.map(|s| s.content.len() + s.heading.len())
.sum();
Page {
url: url.to_string(),
status,
title,
site,
author,
date,
doi,
sections,
links,
code,
alerts,
chars,
}
}
fn extract_title(doc: &Html) -> Option<String> {
select_text(doc, "title")
.or_else(|| select_attr(doc, "meta[property='og:title']", "content"))
.or_else(|| select_text(doc, "h1"))
}
fn extract_meta(doc: &Html, name: &str) -> Option<String> {
select_attr(doc, &format!("meta[property='{name}']"), "content")
.or_else(|| select_attr(doc, &format!("meta[name='{name}']"), "content"))
}
fn extract_doi(doc: &Html) -> Option<String> {
if let Some(doi) = select_attr(doc, "meta[name='citation_doi']", "content") {
return Some(doi);
}
if let Some(doi) = select_attr(doc, "meta[name='DC.identifier']", "content") {
if doi.contains("doi.org") || doi.starts_with("10.") {
return Some(doi);
}
}
if let Ok(sel) = Selector::parse("a[href*='doi.org']") {
if let Some(el) = doc.select(&sel).next() {
if let Some(href) = el.value().attr("href") {
return Some(href.to_string());
}
}
}
None
}
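/// Substring heuristics over the lowercased HTML. Paywall patterns are checked
/// before login patterns, so a page matching both is reported as `Paywall`.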
fn detect_status(html: &str) -> PageStatus {
let lower = html.to_lowercase();
let paywall_patterns = [
"subscribe to continue",
"subscription required",
"premium content",
"paywall",
"member-only",
"members only",
"unlock this article",
"purchase to read",
];
for pattern in paywall_patterns {
if lower.contains(pattern) {
return PageStatus::Paywall;
}
}
let login_patterns = [
"sign in to continue",
"log in to continue",
"login to continue",
"please sign in",
"please log in",
"create an account to",
"sign up to view",
];
for pattern in login_patterns {
if lower.contains(pattern) {
return PageStatus::Login;
}
}
PageStatus::Ok
}
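/// Returns the HTML of the most likely main-content container, trying common
/// selectors in priority order, then the `<body>`, then the whole document.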
fn extract_main_content(doc: &Html) -> String {
let selectors = [
"main",
"article",
"[role='main']",
".post-content",
".article-content",
".entry-content",
"#content",
".content",
];
for sel_str in selectors {
if let Ok(sel) = Selector::parse(sel_str) {
if let Some(el) = doc.select(&sel).next() {
return el.html();
}
}
}
    if let Ok(body_sel) = Selector::parse("body") {
        if let Some(body) = doc.select(&body_sel).next() {
            // No recognized content container: fall back to the whole <body>.
            // Nav/header/footer noise is tolerable here because section
            // extraction keeps only headings and paragraphs.
            return body.html();
        }
    }
doc.html()
}
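/// Walks headings and paragraphs in document order, grouping each run of `<p>`
/// text under the preceding heading. Falls back to a plain-text render when no
/// heading/paragraph structure is found. Capped at 50 sections.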
fn extract_sections(doc: &Html) -> Vec<Section> {
let mut sections = Vec::new();
let mut current_section: Option<Section> = None;
let Ok(sel) = Selector::parse("h1, h2, h3, h4, h5, h6, p") else {
return sections;
};
for el in doc.select(&sel) {
let tag = el.value().name();
let text: String = el.text().collect::<String>().trim().to_string();
        if text.len() < 3 {
            continue;
        }
if tag.starts_with('h') && tag.len() == 2 {
if let Some(s) = current_section.take() {
if !s.content.is_empty() {
sections.push(s);
}
}
let level = tag.chars().nth(1).and_then(|c| c.to_digit(10)).unwrap_or(1);
let level = u8::try_from(level).unwrap_or(1);
current_section = Some(Section {
level,
heading: truncate_section(&text, 200),
content: String::new(),
});
} else if tag == "p" {
if let Some(ref mut s) = current_section {
if !s.content.is_empty() {
s.content.push_str("\n\n");
}
s.content.push_str(&truncate_section(&text, 2000));
}
}
}
if let Some(s) = current_section {
if !s.content.is_empty() {
sections.push(s);
}
}
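    // No heading/paragraph structure found: render the whole document as text,
    // keeping at most 100 non-empty lines.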
if sections.is_empty() {
let text = html2text::from_read(doc.html().as_bytes(), 120)
.unwrap_or_default()
.lines()
.filter(|l| !l.trim().is_empty())
.take(100)
.collect::<Vec<_>>()
.join("\n");
if !text.is_empty() {
sections.push(Section {
level: 1,
heading: "Content".to_string(),
content: truncate_section(&text, 10000),
});
}
}
sections.truncate(50);
sections
}
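/// Collects up to 50 unique links, resolving relative hrefs against `base_url`
/// and skipping fragment-only, `javascript:`, and short-labelled links.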
fn extract_content_links(doc: &Html, base_url: &str) -> Vec<Link> {
let mut links = Vec::new();
let mut seen = HashSet::new();
let base = Url::parse(base_url).ok();
if let Ok(sel) = Selector::parse("a[href]") {
for el in doc.select(&sel) {
let text: String = el.text().collect::<String>().trim().to_string();
let href = el.value().attr("href").unwrap_or("");
if href.is_empty() || href.starts_with('#') || href.starts_with("javascript:") {
continue;
}
if text.len() < 3 {
continue;
}
let full_url = if let Some(ref base) = base {
base.join(href)
.map_or_else(|_| href.to_string(), |u| u.to_string())
} else {
href.to_string()
};
if seen.contains(&full_url) {
continue;
}
seen.insert(full_url.clone());
links.push(Link {
text: truncate(&text, 100),
url: full_url,
});
if links.len() >= 50 {
break;
}
}
}
links
}
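/// Raw-mode link extraction over the full document. Currently identical to
/// `extract_content_links`; kept as a separate entry point for the call site.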
fn extract_all_links(doc: &Html, base_url: &str) -> Vec<Link> {
extract_content_links(doc, base_url)
}
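/// Collects up to 20 deduplicated code listings. The `pre code` pass runs
/// first so a language class on the inner `<code>` element is preferred.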
fn extract_code_blocks(doc: &Html) -> Vec<CodeBlock> {
let mut blocks = Vec::new();
let mut seen_sources = HashSet::new();
for selector_str in ["pre code", "pre"] {
let Ok(sel) = Selector::parse(selector_str) else {
continue;
};
        for el in doc.select(&sel) {
            // Enforce the cap across both selector passes.
            if blocks.len() >= 20 {
                return blocks;
            }
            let source: String = el.text().collect::<String>().trim().to_string();
            if source.len() < 10 {
                continue;
            }
            if seen_sources.contains(&source) {
                continue;
            }
seen_sources.insert(source.clone());
let lang = el.value().attr("class").and_then(|c| {
c.split_whitespace()
.find(|cls| cls.starts_with("language-") || cls.starts_with("lang-"))
.map(|cls| {
cls.trim_start_matches("language-")
.trim_start_matches("lang-")
.to_string()
})
});
blocks.push(CodeBlock {
lang,
source: truncate_section(&source, 5000),
});
}
}
blocks
}
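/// Trimmed text of the first element matching `sel`, if non-empty.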
fn select_text(doc: &Html, sel: &str) -> Option<String> {
let selector = Selector::parse(sel).ok()?;
doc.select(&selector)
.next()
.map(|el| el.text().collect::<String>().trim().to_string())
.filter(|s| !s.is_empty())
}
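/// Trimmed value of `attr` on the first element matching `sel`, if non-empty.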
fn select_attr(doc: &Html, sel: &str, attr: &str) -> Option<String> {
let selector = Selector::parse(sel).ok()?;
doc.select(&selector)
.next()
.and_then(|el| el.value().attr(attr))
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
}
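/// Largest byte index `<= pos` that lies on a UTF-8 character boundary; a
/// stable stand-in for the unstable `str::floor_char_boundary`.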
fn floor_char_boundary(s: &str, pos: usize) -> usize {
if pos >= s.len() {
return s.len();
}
let mut i = pos;
while i > 0 && !s.is_char_boundary(i) {
i -= 1;
}
i
}
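/// Hard truncation with a trailing ellipsis, safe on multi-byte text.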
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        s.to_string()
    } else {
        // Reserve room for the ellipsis; saturate so a tiny `max` cannot underflow.
        let end = floor_char_boundary(s, max.saturating_sub(3));
        format!("{}...", &s[..end])
    }
}
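/// Like `truncate`, but prefers to break at the last space when one falls
/// within 50 bytes of the limit, so words are not cut mid-way.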
fn truncate_section(s: &str, max: usize) -> String {
if s.len() <= max {
s.to_string()
} else {
let end = floor_char_boundary(s, max);
let truncated = &s[..end];
if let Some(last_space) = truncated.rfind(' ') {
if last_space > end.saturating_sub(50) {
return format!("{}...", &s[..last_space]);
}
}
format!("{truncated}...")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_basic() {
let html = r"
<html>
<head><title>Test Page</title></head>
<body>
<main>
<h1>Main Title</h1>
<p>Some content here with enough text to be meaningful.</p>
</main>
</body>
</html>
";
let page = parse_page(html, "https://test.com", false, None);
assert_eq!(page.status, PageStatus::Ok);
assert_eq!(page.title, Some("Test Page".to_string()));
assert!(!page.sections.is_empty());
}
#[test]
fn test_detect_paywall() {
assert_eq!(
detect_status("<div>Subscribe to continue reading</div>"),
PageStatus::Paywall
);
assert_eq!(detect_status("<div>Normal content</div>"), PageStatus::Ok);
}
#[test]
fn test_detect_login() {
assert_eq!(
detect_status("<div>Please sign in to continue</div>"),
PageStatus::Login
);
}
#[test]
fn test_truncate() {
assert_eq!(truncate("short", 10), "short");
assert_eq!(truncate("this is a long string", 10), "this is...");
let dashes = "─".repeat(100);
let result = truncate(&dashes, 20);
assert!(result.ends_with("..."));
        assert!(result.len() <= 23);
    }
#[test]
fn test_floor_char_boundary() {
        let s = "hello─world";
        assert_eq!(floor_char_boundary(s, 5), 5);
        assert_eq!(floor_char_boundary(s, 6), 5);
        assert_eq!(floor_char_boundary(s, 7), 5);
        assert_eq!(floor_char_boundary(s, 8), 8);
    }
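    // Exercises the word-boundary preference in truncate_section: with a space
    // near the limit, truncation should back up to it instead of splitting a word.
    #[test]
    fn test_truncate_section_word_boundary() {
        let s = "word ".repeat(100);
        let out = truncate_section(&s, 103);
        assert!(out.ends_with("word..."), "should break at a word boundary");
        assert!(out.len() <= 106);
    }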
#[test]
fn test_extract_sections_document_order() {
let html = r#"
<html><body>
<h1>Alpha</h1>
<p>Content under alpha, long enough to be meaningful here.</p>
<h2>Beta</h2>
<p>Content under beta, also long enough to be meaningful.</p>
</body></html>
"#;
let page = parse_page(html, "https://test.com", false, None);
assert!(!page.sections.is_empty(), "should have sections");
let alpha = page.sections.iter().find(|s| s.heading == "Alpha");
assert!(alpha.is_some(), "should have Alpha section");
assert!(
alpha.unwrap().content.contains("Content under alpha"),
"Alpha section should contain its own paragraph, got: {:?}",
alpha.unwrap().content
);
let beta = page.sections.iter().find(|s| s.heading == "Beta");
assert!(beta.is_some(), "should have Beta section");
assert!(
beta.unwrap().content.contains("Content under beta"),
"Beta section should contain its own paragraph, got: {:?}",
beta.unwrap().content
);
}
#[test]
fn test_extract_code() {
let html =
r#"<pre><code class="language-rust">fn main() { println!("hello"); }</code></pre>"#;
let doc = Html::parse_document(html);
let blocks = extract_code_blocks(&doc);
assert_eq!(blocks.len(), 1);
assert_eq!(blocks[0].lang, Some("rust".to_string()));
}
#[test]
fn test_parse_page_with_selector() {
let html = r#"
<html><body>
<nav><a href="/">Home</a></nav>
<div id="target">
<h1>Targeted Heading</h1>
<p>This is the targeted content, long enough to be meaningful.</p>
</div>
<footer>Footer noise</footer>
</body></html>
"#;
let page = parse_page(html, "https://test.com", false, Some("#target"));
assert!(!page.sections.is_empty());
let has_targeted = page
.sections
.iter()
.any(|s| s.heading.contains("Targeted") || s.content.contains("targeted content"));
assert!(
has_targeted,
"Should extract targeted element content, got: {:?}",
page.sections
);
}
#[test]
fn test_parse_page_selector_fallback() {
let html = r#"
<html><body>
<h1>Normal Heading</h1>
<p>Normal content that is long enough to be meaningful here.</p>
</body></html>
"#;
let page = parse_page(html, "https://test.com", false, Some(".nonexistent-class"));
assert!(
!page.sections.is_empty(),
"Should fall back to normal extraction"
);
}
}