use crate::browser::BrowserPool;
use crate::schema::{ReferencesFile, Status};
use anyhow::{Context, Result};
use chrono::Utc;
use clap::Args;
use futures::future::join_all;
use scraper::{Html, Selector};
use serde::Serialize;
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
use url::Url;
/// CLI arguments for the `verify-refs` subcommand.
#[derive(Args)]
pub struct VerifyRefsArgs {
    /// Path to the references YAML file to read (and rewrite, unless `--dry-run`).
    pub file: PathBuf,
    /// Number of references verified concurrently (also the browser-pool size).
    #[arg(long, short, default_value = "4")]
    pub parallel: usize,
    /// Only verify references tagged with at least one of these categories;
    /// all others are counted as skipped.
    #[arg(long, short)]
    pub category: Option<Vec<String>>,
    /// Per-page navigation timeout forwarded to the browser
    /// (presumably milliseconds, given the 30000 default — TODO confirm
    /// against `BrowserPool`'s `goto` contract).
    #[arg(long, default_value = "30000")]
    pub timeout: u64,
    /// Run the verification but do not write results back to the file.
    #[arg(long)]
    pub dry_run: bool,
}
/// Aggregate per-status counts for one verification run; serialized as part
/// of the JSON report printed on stdout.
#[derive(Debug, Serialize)]
pub struct VerifySummary {
    /// Total references in the file, including skipped ones.
    pub total: usize,
    /// References whose verification task completed (panicked tasks are not counted).
    pub verified: usize,
    /// Verified references classified as reachable and clean.
    pub ok: usize,
    /// Verified references classified as dead (browser/navigation error, 404, 5xx).
    pub dead: usize,
    /// Verified references that ended on a different host than requested.
    pub redirect: usize,
    /// Verified references whose content matched the paywall heuristics.
    pub paywall: usize,
    /// Verified references whose content matched the login-wall heuristics.
    pub login: usize,
    /// References filtered out (e.g. by `--category`) and never checked.
    pub skipped: usize,
}
/// Top-level JSON report for a verification run.
#[derive(Debug, Serialize)]
pub struct VerifyOutput {
    /// Per-status tallies for this run.
    pub summary: VerifySummary,
    /// Display form of the references file path that was processed.
    pub file: String,
    /// RFC 3339 timestamp of when the report was produced (UTC).
    pub timestamp: String,
}
/// Entry point for the `verify-refs` subcommand: runs the verification pass
/// and prints the resulting report as a single JSON line on stdout.
/// (All progress/diagnostic output goes to stderr inside the core pass.)
pub async fn run_verify_refs(args: VerifyRefsArgs) -> Result<()> {
    let report = verify_refs_core(args).await?;
    let json = serde_json::to_string(&report)?;
    println!("{json}");
    Ok(())
}
/// Core verification pass, separated from `run_verify_refs` so callers can
/// inspect the structured result instead of parsing stdout.
///
/// Loads the references file, verifies each (optionally category-filtered)
/// reference through a shared headless-browser pool, records each entry's
/// status/verified/notes in memory, and — unless `--dry-run` — writes the
/// updated YAML back to the same path.
///
/// # Errors
/// Fails on unreadable/unparsable input, browser-pool startup/shutdown
/// failure, YAML serialization failure, or (non-dry-run) write failure.
pub(crate) async fn verify_refs_core(args: VerifyRefsArgs) -> Result<VerifyOutput> {
    let content = tokio::fs::read_to_string(&args.file)
        .await
        .with_context(|| format!("Failed to read {}", args.file.display()))?;
    let refs_file: ReferencesFile =
        serde_yaml::from_str(&content).context("Failed to parse references.yaml")?;
    let total = refs_file.references.len();
    // Diagnostics go to stderr; stdout is reserved for the caller's JSON report.
    eprintln!("Loaded {} references from {}", total, args.file.display());
    // Collect indices (not clones) of the references passing the category filter.
    let indices_to_verify: Vec<usize> = refs_file
        .references
        .iter()
        .enumerate()
        .filter(|(_, r)| {
            if let Some(cats) = &args.category {
                // Keep a reference if it shares at least one category with the filter.
                r.categories.iter().any(|c| cats.contains(c))
            } else {
                true
            }
        })
        .map(|(i, _)| i)
        .collect();
    let to_verify = indices_to_verify.len();
    let skipped = total - to_verify;
    if to_verify == 0 {
        // Nothing to do: report an all-skipped summary without launching a
        // browser or touching the file.
        eprintln!("No references to verify (all filtered out)");
        return Ok(VerifyOutput {
            summary: compute_summary(total, skipped, &[]),
            file: args.file.display().to_string(),
            timestamp: Utc::now().to_rfc3339(),
        });
    }
    eprintln!(
        "Verifying {} references ({} parallel)...",
        to_verify, args.parallel
    );
    // Pool size equals the requested parallelism; each task draws a page from it.
    let pool = Arc::new(BrowserPool::new(args.parallel).await?);
    let timeout = args.timeout;
    // Shared mutable state: every task writes its result back into the same struct.
    let refs_file = Arc::new(Mutex::new(refs_file));
    let tasks: Vec<_> = indices_to_verify
        .into_iter()
        .map(|idx| {
            let pool = Arc::clone(&pool);
            let refs_file = Arc::clone(&refs_file);
            tokio::spawn(async move {
                // Hold the lock only long enough to clone the URL, so tasks
                // don't serialize on the mutex while pages load.
                let url = {
                    let file = refs_file.lock().await;
                    file.references[idx].url.clone()
                };
                eprintln!(" -> {}", truncate(&url, 60));
                let result = verify_url(&pool, &url, timeout).await;
                {
                    // Re-acquire the lock briefly to record the outcome on the entry.
                    let mut file = refs_file.lock().await;
                    file.references[idx].status = result.status;
                    file.references[idx].verified = Some(Utc::now().to_rfc3339());
                    file.references[idx].notes = result.notes;
                }
                // Returned so the summary can tally it.
                result.status
            })
        })
        .collect();
    // NOTE(review): a panicked task is silently dropped here (`Result::ok`),
    // leaving its reference's old status in place and excluding it from the
    // `verified` count — confirm this best-effort behavior is intended.
    let statuses: Vec<Status> = join_all(tasks)
        .await
        .into_iter()
        .filter_map(std::result::Result::ok)
        .collect();
    // All tasks have completed, so this should be the last Arc; if it somehow
    // isn't, the pool is leaked rather than closed.
    if let Ok(pool) = Arc::try_unwrap(pool) {
        pool.close().await?;
    }
    let summary = compute_summary(total, skipped, &statuses);
    {
        let mut file = refs_file.lock().await;
        file.meta.last_verified = Some(Utc::now().to_rfc3339());
        file.meta.total_links = file.references.len();
    }
    if args.dry_run {
        eprintln!("Dry run - file not modified");
    } else {
        let file = refs_file.lock().await;
        let yaml = serde_yaml::to_string(&*file)?;
        tokio::fs::write(&args.file, yaml)
            .await
            .with_context(|| format!("Failed to write {}", args.file.display()))?;
        eprintln!("Updated {}", args.file.display());
    }
    Ok(VerifyOutput {
        summary,
        file: args.file.display().to_string(),
        timestamp: Utc::now().to_rfc3339(),
    })
}
/// Tally per-status counts into a `VerifySummary`.
///
/// `total` is the number of references in the file, `skipped` how many were
/// filtered out before verification, and `statuses` the outcomes of the
/// references actually checked. `Pending` outcomes are counted in `verified`
/// but land in no per-status bucket.
fn compute_summary(total: usize, skipped: usize, statuses: &[Status]) -> VerifySummary {
    let base = VerifySummary {
        total,
        verified: statuses.len(),
        ok: 0,
        dead: 0,
        redirect: 0,
        paywall: 0,
        login: 0,
        skipped,
    };
    // Fold each checked status into its bucket.
    statuses.iter().fold(base, |mut acc, status| {
        match status {
            Status::Ok => acc.ok += 1,
            Status::Dead => acc.dead += 1,
            Status::Redirect => acc.redirect += 1,
            Status::Paywall => acc.paywall += 1,
            Status::Login => acc.login += 1,
            Status::Pending => {}
        }
        acc
    })
}
/// Outcome of checking a single URL: the status to record plus an optional
/// human-readable note (error text, redirect target, heuristic label, ...).
struct VerifyResult {
    status: Status,
    notes: Option<String>,
}
/// Load one URL in a pooled browser page and classify the result.
///
/// Classification order (first match wins):
/// 1. page-acquisition or navigation failure, or a navigation error -> `Dead`
/// 2. HTTP 404 or any 5xx -> `Dead`
/// 3. final host differs from the requested host (ignoring a leading
///    `www.`) -> `Redirect`, with the final URL in the notes
/// 4. content matches paywall heuristics -> `Paywall`
/// 5. content matches login-wall heuristics -> `Login`
/// 6. otherwise -> `Ok`
///
/// `timeout` is forwarded unchanged to `page.goto` (units defined by
/// `BrowserPool`; presumably milliseconds given the CLI's 30000 default —
/// TODO confirm).
async fn verify_url(pool: &BrowserPool, url: &str, timeout: u64) -> VerifyResult {
    let page = match pool.new_page().await {
        Ok(p) => p,
        Err(e) => {
            return VerifyResult {
                status: Status::Dead,
                notes: Some(format!("Browser error: {e}")),
            }
        }
    };
    // Remember the requested host so cross-host redirects can be detected below.
    let original_host = match Url::parse(url) {
        Ok(u) => u.host_str().map(std::string::ToString::to_string),
        Err(_) => None,
    };
    let nav = match page.goto(url, timeout).await {
        Ok(n) => n,
        Err(e) => {
            return VerifyResult {
                status: Status::Dead,
                notes: Some(format!("Navigation error: {e}")),
            }
        }
    };
    if nav.error.is_some() {
        return VerifyResult {
            status: Status::Dead,
            notes: nav.error,
        };
    }
    // Only 404 and server errors are treated as dead; other client errors
    // (403, 410, ...) fall through to the content heuristics.
    if nav.status == 404 || nav.status >= 500 {
        return VerifyResult {
            status: Status::Dead,
            notes: Some(format!("HTTP {}", nav.status)),
        };
    }
    let final_url = page.current_url().await;
    if let (Some(orig), Some(final_u)) = (&original_host, &final_url) {
        if let Ok(parsed) = Url::parse(final_u) {
            if let Some(final_host) = parsed.host_str() {
                // Compare hosts with any leading "www." stripped so
                // example.com -> www.example.com is not flagged as a redirect.
                let orig_norm = orig.trim_start_matches("www.");
                let final_norm = final_host.trim_start_matches("www.");
                if orig_norm != final_norm {
                    return VerifyResult {
                        status: Status::Redirect,
                        notes: Some(final_u.clone()),
                    };
                }
            }
        }
    }
    // If the rendered content can't be retrieved, report Ok optimistically:
    // navigation itself succeeded and no heuristic can run without HTML.
    let Ok(html) = page.content().await else {
        return VerifyResult {
            status: Status::Ok,
            notes: None,
        };
    };
    if is_paywall(&html) {
        return VerifyResult {
            status: Status::Paywall,
            notes: Some("Paywall detected".to_string()),
        };
    }
    if is_login_wall(&html) {
        return VerifyResult {
            status: Status::Login,
            notes: Some("Login required".to_string()),
        };
    }
    VerifyResult {
        status: Status::Ok,
        notes: None,
    }
}
/// Heuristically detect a paywall in rendered page HTML.
///
/// Two passes: a cheap case-insensitive substring scan for common paywall
/// phrases, then — only if that was inconclusive — a DOM parse looking for
/// paywall-ish fragments in `class`/`id` attributes.
fn is_paywall(html: &str) -> bool {
    let paywall_patterns = [
        "subscribe to continue",
        "subscription required",
        "premium content",
        "paywall",
        "member-only",
        "members only",
        "unlock this article",
        "purchase to read",
        "buy now to read",
        "paid subscribers",
    ];
    let lower = html.to_lowercase();
    if paywall_patterns.iter().any(|&p| lower.contains(p)) {
        return true;
    }
    // Parse the DOM lazily: `Html::parse_document` is far more expensive
    // than the substring scan, which usually decides on real paywall pages.
    let doc = Html::parse_document(html);
    let paywall_selectors = [
        "[class*='paywall']",
        "[id*='paywall']",
        "[class*='subscription-wall']",
        "[class*='piano-offer']",
        "[class*='premium-wall']",
    ];
    paywall_selectors.iter().any(|&sel_str| {
        // A selector that fails to parse simply never matches.
        Selector::parse(sel_str)
            .map_or(false, |sel| doc.select(&sel).next().is_some())
    })
}
/// Heuristically detect a login/registration wall in rendered page HTML.
///
/// Two passes: a cheap case-insensitive substring scan for common login-wall
/// phrases, then — only if that was inconclusive — a DOM parse looking for
/// gate-like fragments in `class`/`id` attributes.
fn is_login_wall(html: &str) -> bool {
    let login_patterns = [
        "sign in to continue",
        "log in to continue",
        "login to continue",
        "please sign in",
        "please log in",
        "create an account",
        "sign up to view",
        "register to view",
        "authentication required",
    ];
    let lower = html.to_lowercase();
    if login_patterns.iter().any(|&p| lower.contains(p)) {
        return true;
    }
    // Parse the DOM lazily: `Html::parse_document` is far more expensive
    // than the substring scan, which usually decides on real gated pages.
    let doc = Html::parse_document(html);
    let login_selectors = [
        "[class*='login-wall']",
        "[class*='auth-wall']",
        "[class*='signup-wall']",
        "[id*='login-modal']",
        "[class*='gate-content']",
    ];
    login_selectors.iter().any(|&sel_str| {
        // A selector that fails to parse simply never matches.
        Selector::parse(sel_str)
            .map_or(false, |sel| doc.select(&sel).next().is_some())
    })
}
/// Truncate `s` to at most `max` bytes for display, appending "..." when the
/// string is shortened.
///
/// Fixes two panics in the original: slicing `&s[..max - 3]` panicked when
/// the cut landed mid-codepoint (URLs may contain multi-byte UTF-8), and
/// `max - 3` underflowed for `max < 3`.
fn truncate(s: &str, max: usize) -> String {
    if s.len() <= max {
        return s.to_string();
    }
    // Reserve room for the ellipsis; saturate so tiny `max` can't underflow.
    let mut end = max.saturating_sub(3).min(s.len());
    // Back up to a char boundary so the slice below can't panic mid-codepoint.
    while end > 0 && !s.is_char_boundary(end) {
        end -= 1;
    }
    format!("{}...", &s[..end])
}
#[cfg(test)]
mod tests {
    use super::*;
    // Unit tests cover the pure HTML heuristics only; the async browser
    // paths (`verify_url`, `verify_refs_core`) are not exercised here.
    #[test]
    fn test_is_paywall() {
        // One text-phrase hit and one class-attribute selector hit.
        assert!(is_paywall("<div>Subscribe to continue reading</div>"));
        assert!(is_paywall("<div class='paywall-overlay'>content</div>"));
        assert!(!is_paywall("<div>Normal content here</div>"));
    }
    #[test]
    fn test_is_login_wall() {
        // One text-phrase hit and one class-attribute selector hit.
        assert!(is_login_wall("<div>Please sign in to continue</div>"));
        assert!(is_login_wall("<div class='login-wall'>content</div>"));
        assert!(!is_login_wall("<div>Normal content here</div>"));
    }
}