use std::time::Duration;
use crate::entity::{Entity, FieldValue};
use crate::parser::ParseError;
use crate::relationship::Rel;
/// Upper bound on the number of distinct URLs collected for one run;
/// `collect_urls` records a `ParseError` when the deduplicated list exceeds it.
const MAX_URLS_PER_RUN: usize = 2_000;
/// Maximum number of redirects, passed to `reqwest::redirect::Policy::limited`
/// when the HTTP client is built.
const MAX_REDIRECTS: usize = 5;
/// `User-Agent` value configured on the HTTP client used for verification.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
/// Result of verifying one URL.
#[derive(Debug)]
pub struct UrlCheck {
    /// The URL this result refers to.
    pub url: String,
    /// Severity of the outcome.
    pub status: CheckStatus,
    /// Explanation for a non-`Ok` outcome (for example an HTTP status line,
    /// `"timeout"`, or a transport error); `None` when the check passed.
    pub detail: Option<String>,
    /// Whether the URL was verified as a thumbnail, i.e. subject to the
    /// image content-type check.
    pub is_thumbnail: bool,
}
/// Severity attached to a [`UrlCheck`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The URL answered with a success status (and, for thumbnails, did not
    /// advertise a non-image content type).
    Ok,
    /// Inconclusive outcome: the request timed out or ended on a redirect status.
    Warn,
    /// The URL is considered broken: transport failure, non-success HTTP
    /// status, or a thumbnail served with a non-image content type.
    Error,
}

impl std::fmt::Display for CheckStatus {
    /// Writes the lowercase label (`ok`, `warn`, `error`) for the status.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        f.write_str(label)
    }
}
/// A URL queued for verification, tagged with whether it is a thumbnail.
#[derive(Debug, Clone)]
pub struct UrlEntry {
    /// The URL to request.
    url: String,
    /// `true` when the URL came from a `thumbnail` / `thumbnail_source` field.
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Returns the URL as a string slice.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Returns whether this entry is flagged as a thumbnail.
    pub fn is_thumbnail(&self) -> bool {
        let Self { is_thumbnail, .. } = self;
        *is_thumbnail
    }
}
/// Collects the thumbnail URLs of every entity in the registry.
///
/// Only `thumbnail` and `thumbnail_source` fields holding a non-empty
/// `FieldValue::Single` are considered. Each distinct URL is returned once,
/// in the order it is first encountered, and is flagged as a thumbnail.
pub fn collect_registry_urls(reg: &crate::registry::EntityRegistry) -> Vec<UrlEntry> {
    let mut already_seen = std::collections::HashSet::new();
    let mut thumbnails = Vec::new();

    for name in reg.names() {
        let Some(entry) = reg.get_by_name(name) else {
            continue;
        };

        for (key, value) in &entry.entity.fields {
            let is_thumbnail_key = matches!(key.as_str(), "thumbnail" | "thumbnail_source");
            if !is_thumbnail_key {
                continue;
            }
            let FieldValue::Single(url) = value else {
                continue;
            };
            // `insert` returns false for a URL that was already recorded.
            if url.is_empty() || !already_seen.insert(url.clone()) {
                continue;
            }
            thumbnails.push(UrlEntry {
                url: url.clone(),
                is_thumbnail: true,
            });
        }
    }

    thumbnails
}
/// Gathers every URL that should be verified for a parsed document.
///
/// URLs are taken, in this order, from:
/// 1. `sources` (never thumbnails),
/// 2. entity fields: non-empty `thumbnail` / `thumbnail_source` singles
///    (thumbnails) and every item of a `urls` list (not thumbnails),
/// 3. `source_urls` of each relationship (never thumbnails).
///
/// The result is deduplicated by URL and keeps first-seen order. When the same
/// URL occurs both as a thumbnail and as a regular link, the single returned
/// entry is flagged as a thumbnail so that the image content-type check is not
/// silently skipped just because the link was encountered first.
///
/// If more than `MAX_URLS_PER_RUN` distinct URLs are found, a `ParseError`
/// (with `line: 0`) is appended to `errors`; the full list is still returned.
pub fn collect_urls(
    sources: &[crate::parser::SourceEntry],
    entities: &[Entity],
    rels: &[Rel],
    errors: &mut Vec<ParseError>,
) -> Vec<UrlEntry> {
    let mut urls: Vec<UrlEntry> = Vec::new();
    // Maps each URL to its position in `urls` so duplicates can be merged in place.
    let mut first_index: std::collections::HashMap<String, usize> =
        std::collections::HashMap::new();

    let mut record = |url: String, is_thumbnail: bool| {
        match first_index.get(&url).copied() {
            // Duplicate: keep the original position, but never lose the thumbnail flag.
            Some(idx) => urls[idx].is_thumbnail |= is_thumbnail,
            None => {
                first_index.insert(url.clone(), urls.len());
                urls.push(UrlEntry { url, is_thumbnail });
            }
        }
    };

    for source in sources {
        record(source.url().to_string(), false);
    }

    for entity in entities {
        for (key, value) in &entity.fields {
            match key.as_str() {
                "thumbnail" | "thumbnail_source" => {
                    if let FieldValue::Single(url) = value
                        && !url.is_empty()
                    {
                        record(url.clone(), true);
                    }
                }
                "urls" => {
                    if let FieldValue::List(items) = value {
                        for url in items {
                            record(url.clone(), false);
                        }
                    }
                }
                _ => {}
            }
        }
    }

    for rel in rels {
        for url in &rel.source_urls {
            record(url.clone(), false);
        }
    }

    if urls.len() > MAX_URLS_PER_RUN {
        errors.push(ParseError {
            line: 0,
            message: format!(
                "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
                urls.len()
            ),
        });
    }

    urls
}
pub async fn verify_urls(
urls: Vec<UrlEntry>,
concurrency: usize,
timeout_secs: u64,
) -> Vec<UrlCheck> {
let client = reqwest::Client::builder()
.user_agent(USER_AGENT)
.redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
.timeout(Duration::from_secs(timeout_secs))
.build()
.unwrap_or_else(|_| reqwest::Client::new());
let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
let client = std::sync::Arc::new(client);
let mut handles = Vec::new();
for entry in urls {
let sem = semaphore.clone();
let cli = client.clone();
handles.push(tokio::spawn(async move {
let _permit = sem.acquire().await;
check_url(&cli, &entry.url, entry.is_thumbnail).await
}));
}
let mut results = Vec::new();
for handle in handles {
match handle.await {
Ok(check) => results.push(check),
Err(e) => results.push(UrlCheck {
url: "unknown".into(),
status: CheckStatus::Error,
detail: Some(format!("task panicked: {e}")),
is_thumbnail: false,
}),
}
}
results
}
async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
match client.head(url).send().await {
Ok(resp) => {
let status = resp.status();
if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
return check_url_get(client, url, is_thumbnail).await;
}
evaluate_response(url, status, resp.headers(), is_thumbnail)
}
Err(e) => {
if e.is_timeout() {
UrlCheck {
url: url.to_string(),
status: CheckStatus::Warn,
detail: Some("timeout".into()),
is_thumbnail,
}
} else {
UrlCheck {
url: url.to_string(),
status: CheckStatus::Error,
detail: Some(format!("{e}")),
is_thumbnail,
}
}
}
}
}
async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
match client.get(url).send().await {
Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
Err(e) => {
if e.is_timeout() {
UrlCheck {
url: url.to_string(),
status: CheckStatus::Warn,
detail: Some("timeout".into()),
is_thumbnail,
}
} else {
UrlCheck {
url: url.to_string(),
status: CheckStatus::Error,
detail: Some(format!("{e}")),
is_thumbnail,
}
}
}
}
}
fn evaluate_response(
url: &str,
status: reqwest::StatusCode,
headers: &reqwest::header::HeaderMap,
is_thumbnail: bool,
) -> UrlCheck {
if status.is_success() {
if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
let ct_str = ct.to_str().unwrap_or("");
if !ct_str.starts_with("image/") {
return UrlCheck {
url: url.to_string(),
status: CheckStatus::Error,
detail: Some(format!("expected content-type image/*, got {ct_str}")),
is_thumbnail,
};
}
}
UrlCheck {
url: url.to_string(),
status: CheckStatus::Ok,
detail: None,
is_thumbnail,
}
} else if status.is_redirection() {
UrlCheck {
url: url.to_string(),
status: CheckStatus::Warn,
detail: Some(format!("HTTP {status}")),
is_thumbnail,
}
} else {
UrlCheck {
url: url.to_string(),
status: CheckStatus::Error,
detail: Some(format!("HTTP {status}")),
is_thumbnail,
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal `Person` entity carrying exactly one field.
    fn person_with_field(key: &str, value: FieldValue) -> Entity {
        Entity {
            name: "Test".into(),
            label: crate::entity::Label::Person,
            fields: vec![(key.into(), value)],
            id: None,
            line: 1,
            tags: Vec::new(),
            slug: None,
        }
    }

    /// Builds `count` distinct URL sources.
    fn distinct_sources(count: usize) -> Vec<crate::parser::SourceEntry> {
        (0..count)
            .map(|i| crate::parser::SourceEntry::Url(format!("https://example.com/{i}")))
            .collect()
    }

    /// Builds a header map containing only the given `Content-Type`.
    fn content_type_headers(value: &'static str) -> reqwest::header::HeaderMap {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            reqwest::header::HeaderValue::from_static(value),
        );
        headers
    }

    /// Builds a one-element work list pointing at `path` on the server at `base_url`.
    fn single_entry(base_url: &str, path: &str, is_thumbnail: bool) -> Vec<UrlEntry> {
        vec![UrlEntry {
            url: format!("{base_url}{path}"),
            is_thumbnail,
        }]
    }

    #[test]
    fn collect_urls_deduplicates() {
        let sources = vec![
            crate::parser::SourceEntry::Url("https://a.com".into()),
            crate::parser::SourceEntry::Url("https://b.com".into()),
        ];
        let entities = vec![person_with_field(
            "urls",
            FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
        )];
        let mut errors = Vec::new();
        let urls = collect_urls(&sources, &entities, &[], &mut errors);
        assert!(errors.is_empty());
        // https://a.com appears twice but must be reported once.
        assert_eq!(urls.len(), 3);
    }

    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![person_with_field(
            "thumbnail",
            FieldValue::Single("https://img.com/photo.jpg".into()),
        )];
        let mut errors = Vec::new();
        let urls = collect_urls(&[], &entities, &[], &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(urls[0].is_thumbnail);
    }

    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "associate_of".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();
        let urls = collect_urls(&[], &[], &rels, &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(!urls[0].is_thumbnail);
    }

    #[test]
    fn collect_urls_boundary() {
        // One past the limit must be reported; derived from the constant so the
        // test keeps guarding the boundary if the limit changes.
        let sources = distinct_sources(MAX_URLS_PER_RUN + 1);
        let mut errors = Vec::new();
        collect_urls(&sources, &[], &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("too many URLs")));
    }

    #[test]
    fn collect_urls_at_limit_is_accepted() {
        let sources = distinct_sources(MAX_URLS_PER_RUN);
        let mut errors = Vec::new();
        let urls = collect_urls(&sources, &[], &[], &mut errors);
        assert_eq!(urls.len(), MAX_URLS_PER_RUN);
        assert!(errors.is_empty());
    }

    #[test]
    fn evaluate_success() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[test]
    fn evaluate_not_found() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let headers = content_type_headers("text/html");
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
    }

    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let headers = content_type_headers("image/jpeg");
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;
        let urls = single_entry(&server.url(), "/page", false);
        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;
        let urls = single_entry(&server.url(), "/missing", false);
        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(results[0].detail.as_deref().unwrap_or("").contains("404"));
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head_mock = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get_mock = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;
        let urls = single_entry(&server.url(), "/no-head", false);
        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        // Both the rejected HEAD and the fallback GET must have been issued.
        head_mock.assert_async().await;
        get_mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;
        let urls = single_entry(&server.url(), "/img.jpg", true);
        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;
        let urls = single_entry(&server.url(), "/not-image", true);
        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(
            results[0]
                .detail
                .as_deref()
                .unwrap_or("")
                .contains("image/*")
        );
        mock.assert_async().await;
    }
}