1use std::time::Duration;
2
3use crate::entity::{Entity, FieldValue};
4use crate::parser::ParseError;
5use crate::relationship::Rel;
6
/// Maximum number of distinct URLs `collect_urls` accepts before it records an error.
const MAX_URLS_PER_RUN: usize = 2_000;

/// Maximum number of redirects followed by the HTTP client built in `verify_urls`.
const MAX_REDIRECTS: usize = 5;

/// `User-Agent` value configured on the HTTP client built in `verify_urls`.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
15
/// Outcome of verifying a single URL.
#[derive(Debug)]
pub struct UrlCheck {
    /// URL this result refers to.
    pub url: String,
    /// Severity of the outcome.
    pub status: CheckStatus,
    /// Human-readable explanation (e.g. `HTTP 404 Not Found`, `timeout`);
    /// `None` when the check succeeded.
    pub detail: Option<String>,
    /// Whether the URL was checked as a thumbnail (image content type expected).
    pub is_thumbnail: bool,
}
25
/// Severity assigned to a URL check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The URL responded successfully.
    Ok,
    /// The URL could not be confirmed (e.g. timeout or unresolved redirect).
    Warn,
    /// The URL is broken or returned unexpected content.
    Error,
}

/// Renders the status as its lowercase name: `ok`, `warn` or `error`.
impl std::fmt::Display for CheckStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        // Written verbatim; width/fill flags are intentionally not applied.
        f.write_str(label)
    }
}
43
/// A URL queued for verification, tagged with how it should be checked.
#[derive(Debug, Clone)]
pub struct UrlEntry {
    // The URL text exactly as it was collected.
    url: String,
    // True when the URL came from a thumbnail field.
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Borrows the URL text.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Reports whether this entry was collected as a thumbnail.
    pub fn is_thumbnail(&self) -> bool {
        let Self { is_thumbnail, .. } = self;
        *is_thumbnail
    }
}
62
63pub fn collect_registry_urls(reg: &crate::registry::EntityRegistry) -> Vec<UrlEntry> {
65 let mut urls = Vec::new();
66 let mut seen = std::collections::HashSet::new();
67
68 for name in reg.names() {
69 if let Some(entry) = reg.get_by_name(name) {
70 for (key, value) in &entry.entity.fields {
71 if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
72 && let FieldValue::Single(url) = value
73 && !url.is_empty()
74 && seen.insert(url.clone())
75 {
76 urls.push(UrlEntry {
77 url: url.clone(),
78 is_thumbnail: true,
79 });
80 }
81 }
82 }
83 }
84
85 urls
86}
87
88pub fn collect_urls(
90 sources: &[crate::parser::SourceEntry],
91 entities: &[Entity],
92 rels: &[Rel],
93 errors: &mut Vec<ParseError>,
94) -> Vec<UrlEntry> {
95 let mut urls = Vec::new();
96
97 for source in sources {
99 urls.push(UrlEntry {
100 url: source.url().to_string(),
101 is_thumbnail: false,
102 });
103 }
104
105 for entity in entities {
107 for (key, value) in &entity.fields {
108 match key.as_str() {
109 "thumbnail" | "thumbnail_source" => {
110 if let FieldValue::Single(url) = value
111 && !url.is_empty()
112 {
113 urls.push(UrlEntry {
114 url: url.clone(),
115 is_thumbnail: true,
116 });
117 }
118 }
119 "urls" => {
120 if let FieldValue::List(items) = value {
121 for url in items {
122 urls.push(UrlEntry {
123 url: url.clone(),
124 is_thumbnail: false,
125 });
126 }
127 }
128 }
129 _ => {}
130 }
131 }
132 }
133
134 for rel in rels {
136 for url in &rel.source_urls {
137 urls.push(UrlEntry {
138 url: url.clone(),
139 is_thumbnail: false,
140 });
141 }
142 }
143
144 let mut seen = std::collections::HashSet::new();
146 urls.retain(|entry| seen.insert(entry.url.clone()));
147
148 if urls.len() > MAX_URLS_PER_RUN {
150 errors.push(ParseError {
151 line: 0,
152 message: format!(
153 "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
154 urls.len()
155 ),
156 });
157 }
158
159 urls
160}
161
162pub async fn verify_urls(
164 urls: Vec<UrlEntry>,
165 concurrency: usize,
166 timeout_secs: u64,
167) -> Vec<UrlCheck> {
168 let client = reqwest::Client::builder()
169 .user_agent(USER_AGENT)
170 .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
171 .timeout(Duration::from_secs(timeout_secs))
172 .build()
173 .unwrap_or_else(|_| reqwest::Client::new());
174
175 let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
176 let client = std::sync::Arc::new(client);
177
178 let mut handles = Vec::new();
179
180 for entry in urls {
181 let sem = semaphore.clone();
182 let cli = client.clone();
183 handles.push(tokio::spawn(async move {
184 let _permit = sem.acquire().await;
185 check_url(&cli, &entry.url, entry.is_thumbnail).await
186 }));
187 }
188
189 let mut results = Vec::new();
190 for handle in handles {
191 match handle.await {
192 Ok(check) => results.push(check),
193 Err(e) => results.push(UrlCheck {
194 url: "unknown".into(),
195 status: CheckStatus::Error,
196 detail: Some(format!("task panicked: {e}")),
197 is_thumbnail: false,
198 }),
199 }
200 }
201
202 results
203}
204
205async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
206 match client.head(url).send().await {
208 Ok(resp) => {
209 let status = resp.status();
210
211 if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
213 return check_url_get(client, url, is_thumbnail).await;
214 }
215
216 evaluate_response(url, status, resp.headers(), is_thumbnail)
217 }
218 Err(e) => {
219 if e.is_timeout() {
220 UrlCheck {
221 url: url.to_string(),
222 status: CheckStatus::Warn,
223 detail: Some("timeout".into()),
224 is_thumbnail,
225 }
226 } else {
227 UrlCheck {
228 url: url.to_string(),
229 status: CheckStatus::Error,
230 detail: Some(format!("{e}")),
231 is_thumbnail,
232 }
233 }
234 }
235 }
236}
237
238async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
239 match client.get(url).send().await {
240 Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
241 Err(e) => {
242 if e.is_timeout() {
243 UrlCheck {
244 url: url.to_string(),
245 status: CheckStatus::Warn,
246 detail: Some("timeout".into()),
247 is_thumbnail,
248 }
249 } else {
250 UrlCheck {
251 url: url.to_string(),
252 status: CheckStatus::Error,
253 detail: Some(format!("{e}")),
254 is_thumbnail,
255 }
256 }
257 }
258 }
259}
260
261fn evaluate_response(
262 url: &str,
263 status: reqwest::StatusCode,
264 headers: &reqwest::header::HeaderMap,
265 is_thumbnail: bool,
266) -> UrlCheck {
267 if status.is_success() {
268 if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
270 let ct_str = ct.to_str().unwrap_or("");
271 if !ct_str.starts_with("image/") {
272 return UrlCheck {
273 url: url.to_string(),
274 status: CheckStatus::Error,
275 detail: Some(format!("expected content-type image/*, got {ct_str}")),
276 is_thumbnail,
277 };
278 }
279 }
280
281 UrlCheck {
282 url: url.to_string(),
283 status: CheckStatus::Ok,
284 detail: None,
285 is_thumbnail,
286 }
287 } else if status.is_redirection() {
288 UrlCheck {
290 url: url.to_string(),
291 status: CheckStatus::Warn,
292 detail: Some(format!("HTTP {status}")),
293 is_thumbnail,
294 }
295 } else {
296 UrlCheck {
297 url: url.to_string(),
298 status: CheckStatus::Error,
299 detail: Some(format!("HTTP {status}")),
300 is_thumbnail,
301 }
302 }
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a minimal entity carrying exactly one field.
    fn entity_with_field(key: &'static str, value: FieldValue) -> Entity {
        Entity {
            name: "Test".into(),
            label: crate::entity::Label::Person,
            fields: vec![(key.into(), value)],
            id: None,
            line: 1,
            tags: Vec::new(),
            slug: None,
        }
    }

    /// Builds `count` source entries with pairwise distinct URLs.
    fn numbered_sources(count: usize) -> Vec<crate::parser::SourceEntry> {
        (0..count)
            .map(|i| crate::parser::SourceEntry::Url(format!("https://example.com/{i}")))
            .collect()
    }

    #[test]
    fn collect_urls_deduplicates() {
        let sources = vec![
            crate::parser::SourceEntry::Url("https://a.com".into()),
            crate::parser::SourceEntry::Url("https://b.com".into()),
        ];
        let entities = vec![entity_with_field(
            "urls",
            FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
        )];
        let mut errors = Vec::new();

        let urls = collect_urls(&sources, &entities, &[], &mut errors);
        assert!(errors.is_empty());
        assert_eq!(urls.len(), 3);

        // Duplicates are dropped in favour of the first occurrence's position.
        let listed: Vec<&str> = urls.iter().map(UrlEntry::url).collect();
        assert_eq!(listed, ["https://a.com", "https://b.com", "https://c.com"]);
    }

    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![entity_with_field(
            "thumbnail",
            FieldValue::Single("https://img.com/photo.jpg".into()),
        )];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &entities, &[], &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(urls[0].is_thumbnail);
    }

    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "associate_of".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &[], &rels, &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(!urls[0].is_thumbnail);
    }

    #[test]
    fn collect_urls_boundary() {
        // One URL over the limit must be reported.
        let sources = numbered_sources(MAX_URLS_PER_RUN + 1);
        let mut errors = Vec::new();

        collect_urls(&sources, &[], &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("too many URLs")));
    }

    #[test]
    fn collect_urls_accepts_exactly_max() {
        // Exactly the limit is still allowed.
        let sources = numbered_sources(MAX_URLS_PER_RUN);
        let mut errors = Vec::new();

        let urls = collect_urls(&sources, &[], &[], &mut errors);
        assert!(errors.is_empty());
        assert_eq!(urls.len(), MAX_URLS_PER_RUN);
    }

    #[test]
    fn evaluate_success() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[test]
    fn evaluate_not_found() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            reqwest::header::HeaderValue::from_static("text/html"),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
    }

    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            reqwest::header::HeaderValue::from_static("image/jpeg"),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/page", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/missing", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(results[0].detail.as_deref().unwrap_or("").contains("404"));
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head_mock = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get_mock = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/no-head", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        head_mock.assert_async().await;
        get_mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/img.jpg", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/not-image", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(
            results[0]
                .detail
                .as_deref()
                .unwrap_or("")
                .contains("image/*")
        );
        mock.assert_async().await;
    }
}