1use std::time::Duration;
2
3use crate::entity::{Entity, FieldValue};
4use crate::parser::ParseError;
5use crate::relationship::Rel;
6
/// Upper bound on the number of distinct URLs accepted for one verification run.
///
/// `collect_urls` records a `ParseError` when the de-duplicated list is longer than this.
const MAX_URLS_PER_RUN: usize = 500;

/// Maximum number of redirects the HTTP client follows for a single request.
const MAX_REDIRECTS: usize = 5;

/// Value of the `User-Agent` header sent with every verification request.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
15
16#[derive(Debug)]
18pub struct UrlCheck {
19 pub url: String,
20 pub status: CheckStatus,
21 pub detail: Option<String>,
22 pub is_thumbnail: bool,
24}
25
/// Severity assigned to the result of a URL check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The URL responded successfully.
    Ok,
    /// The URL could not be confirmed (for example a timeout or a redirect response).
    Warn,
    /// The URL failed verification.
    Error,
}

impl std::fmt::Display for CheckStatus {
    /// Writes the lowercase label (`ok`, `warn`, `error`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        // `pad` honours width, fill and alignment flags (e.g. `{:<5}` in a report table);
        // `write!(f, "...")` would silently ignore them.
        f.pad(label)
    }
}
43
/// A URL queued for verification, tagged with how it must be validated.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct UrlEntry {
    url: String,
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Returns the URL to verify.
    #[must_use]
    pub fn url(&self) -> &str {
        &self.url
    }

    /// Returns `true` when the URL came from a thumbnail field.
    #[must_use]
    pub fn is_thumbnail(&self) -> bool {
        self.is_thumbnail
    }
}
62
63pub fn collect_registry_urls(reg: &crate::registry::EntityRegistry) -> Vec<UrlEntry> {
65 let mut urls = Vec::new();
66 let mut seen = std::collections::HashSet::new();
67
68 for name in reg.names() {
69 if let Some(entry) = reg.get_by_name(name) {
70 for (key, value) in &entry.entity.fields {
71 if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
72 && let FieldValue::Single(url) = value
73 && !url.is_empty()
74 && seen.insert(url.clone())
75 {
76 urls.push(UrlEntry {
77 url: url.clone(),
78 is_thumbnail: true,
79 });
80 }
81 }
82 }
83 }
84
85 urls
86}
87
88pub fn collect_urls(
90 sources: &[String],
91 entities: &[Entity],
92 rels: &[Rel],
93 errors: &mut Vec<ParseError>,
94) -> Vec<UrlEntry> {
95 let mut urls = Vec::new();
96
97 for url in sources {
99 urls.push(UrlEntry {
100 url: url.clone(),
101 is_thumbnail: false,
102 });
103 }
104
105 for entity in entities {
107 for (key, value) in &entity.fields {
108 match key.as_str() {
109 "thumbnail" | "thumbnail_source" => {
110 if let FieldValue::Single(url) = value
111 && !url.is_empty()
112 {
113 urls.push(UrlEntry {
114 url: url.clone(),
115 is_thumbnail: true,
116 });
117 }
118 }
119 "urls" => {
120 if let FieldValue::List(items) = value {
121 for url in items {
122 urls.push(UrlEntry {
123 url: url.clone(),
124 is_thumbnail: false,
125 });
126 }
127 }
128 }
129 _ => {}
130 }
131 }
132 }
133
134 for rel in rels {
136 for url in &rel.source_urls {
137 urls.push(UrlEntry {
138 url: url.clone(),
139 is_thumbnail: false,
140 });
141 }
142 }
143
144 let mut seen = std::collections::HashSet::new();
146 urls.retain(|entry| seen.insert(entry.url.clone()));
147
148 if urls.len() > MAX_URLS_PER_RUN {
150 errors.push(ParseError {
151 line: 0,
152 message: format!(
153 "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
154 urls.len()
155 ),
156 });
157 }
158
159 urls
160}
161
162pub async fn verify_urls(
164 urls: Vec<UrlEntry>,
165 concurrency: usize,
166 timeout_secs: u64,
167) -> Vec<UrlCheck> {
168 let client = reqwest::Client::builder()
169 .user_agent(USER_AGENT)
170 .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
171 .timeout(Duration::from_secs(timeout_secs))
172 .build()
173 .unwrap_or_else(|_| reqwest::Client::new());
174
175 let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
176 let client = std::sync::Arc::new(client);
177
178 let mut handles = Vec::new();
179
180 for entry in urls {
181 let sem = semaphore.clone();
182 let cli = client.clone();
183 handles.push(tokio::spawn(async move {
184 let _permit = sem.acquire().await;
185 check_url(&cli, &entry.url, entry.is_thumbnail).await
186 }));
187 }
188
189 let mut results = Vec::new();
190 for handle in handles {
191 match handle.await {
192 Ok(check) => results.push(check),
193 Err(e) => results.push(UrlCheck {
194 url: "unknown".into(),
195 status: CheckStatus::Error,
196 detail: Some(format!("task panicked: {e}")),
197 is_thumbnail: false,
198 }),
199 }
200 }
201
202 results
203}
204
205async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
206 match client.head(url).send().await {
208 Ok(resp) => {
209 let status = resp.status();
210
211 if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
213 return check_url_get(client, url, is_thumbnail).await;
214 }
215
216 evaluate_response(url, status, resp.headers(), is_thumbnail)
217 }
218 Err(e) => {
219 if e.is_timeout() {
220 UrlCheck {
221 url: url.to_string(),
222 status: CheckStatus::Warn,
223 detail: Some("timeout".into()),
224 is_thumbnail,
225 }
226 } else {
227 UrlCheck {
228 url: url.to_string(),
229 status: CheckStatus::Error,
230 detail: Some(format!("{e}")),
231 is_thumbnail,
232 }
233 }
234 }
235 }
236}
237
238async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
239 match client.get(url).send().await {
240 Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
241 Err(e) => {
242 if e.is_timeout() {
243 UrlCheck {
244 url: url.to_string(),
245 status: CheckStatus::Warn,
246 detail: Some("timeout".into()),
247 is_thumbnail,
248 }
249 } else {
250 UrlCheck {
251 url: url.to_string(),
252 status: CheckStatus::Error,
253 detail: Some(format!("{e}")),
254 is_thumbnail,
255 }
256 }
257 }
258 }
259}
260
261fn evaluate_response(
262 url: &str,
263 status: reqwest::StatusCode,
264 headers: &reqwest::header::HeaderMap,
265 is_thumbnail: bool,
266) -> UrlCheck {
267 if status.is_success() {
268 if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
270 let ct_str = ct.to_str().unwrap_or("");
271 if !ct_str.starts_with("image/") {
272 return UrlCheck {
273 url: url.to_string(),
274 status: CheckStatus::Error,
275 detail: Some(format!("expected content-type image/*, got {ct_str}")),
276 is_thumbnail,
277 };
278 }
279 }
280
281 UrlCheck {
282 url: url.to_string(),
283 status: CheckStatus::Ok,
284 detail: None,
285 is_thumbnail,
286 }
287 } else if status.is_redirection() {
288 UrlCheck {
290 url: url.to_string(),
291 status: CheckStatus::Warn,
292 detail: Some(format!("HTTP {status}")),
293 is_thumbnail,
294 }
295 } else {
296 UrlCheck {
297 url: url.to_string(),
298 status: CheckStatus::Error,
299 detail: Some(format!("HTTP {status}")),
300 is_thumbnail,
301 }
302 }
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a header map containing only the given `Content-Type`.
    fn content_type_headers(value: &'static str) -> reqwest::header::HeaderMap {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            reqwest::header::HeaderValue::from_static(value),
        );
        headers
    }

    /// Runs `verify_urls` on a single URL and returns its only result.
    async fn verify_single(url: String, is_thumbnail: bool) -> UrlCheck {
        let mut results = verify_urls(vec![UrlEntry { url, is_thumbnail }], 4, 5).await;
        assert_eq!(results.len(), 1);
        results.remove(0)
    }

    #[test]
    fn collect_urls_deduplicates() {
        let sources: Vec<String> = vec!["https://a.com".into(), "https://b.com".into()];
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Actor,
            fields: vec![(
                "urls".into(),
                FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
            )],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&sources, &entities, &[], &mut errors);
        assert!(errors.is_empty());
        assert_eq!(urls.len(), 3);

        // The first occurrence of each URL is kept, in encounter order.
        let collected: Vec<&str> = urls.iter().map(UrlEntry::url).collect();
        assert_eq!(
            collected,
            ["https://a.com", "https://b.com", "https://c.com"]
        );
    }

    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Actor,
            fields: vec![(
                "thumbnail".into(),
                FieldValue::Single("https://img.com/photo.jpg".into()),
            )],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &entities, &[], &mut errors);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].url, "https://img.com/photo.jpg");
        assert!(urls[0].is_thumbnail);
    }

    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "related_to".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &[], &rels, &mut errors);
        assert_eq!(urls.len(), 1);
        assert_eq!(urls[0].url, "https://src.com");
        assert!(!urls[0].is_thumbnail);
    }

    #[test]
    fn collect_urls_boundary() {
        let numbered = |count: usize| -> Vec<String> {
            (0..count)
                .map(|i| format!("https://example.com/{i}"))
                .collect()
        };

        // Exactly at the limit: accepted without an error.
        let mut errors = Vec::new();
        let urls = collect_urls(&numbered(MAX_URLS_PER_RUN), &[], &[], &mut errors);
        assert_eq!(urls.len(), MAX_URLS_PER_RUN);
        assert!(errors.is_empty());

        // One past the limit: reported.
        let mut errors = Vec::new();
        collect_urls(&numbered(MAX_URLS_PER_RUN + 1), &[], &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("too many URLs")));
    }

    #[test]
    fn evaluate_success() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
        assert!(check.detail.is_none());
    }

    #[test]
    fn evaluate_not_found() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    #[test]
    fn evaluate_redirect_is_warning() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_MODIFIED,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Warn);
        assert!(check.detail.as_deref().unwrap_or("").contains("304"));
    }

    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &content_type_headers("text/html"),
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
    }

    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &content_type_headers("image/jpeg"),
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[test]
    fn evaluate_thumbnail_without_content_type_is_ok() {
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;

        let check = verify_single(format!("{}/page", server.url()), false).await;
        assert_eq!(check.status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;

        let check = verify_single(format!("{}/missing", server.url()), false).await;
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("404"));
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head_mock = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get_mock = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;

        let check = verify_single(format!("{}/no-head", server.url()), false).await;
        assert_eq!(check.status, CheckStatus::Ok);
        head_mock.assert_async().await;
        get_mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;

        let check = verify_single(format!("{}/img.jpg", server.url()), true).await;
        assert_eq!(check.status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;

        let check = verify_single(format!("{}/not-image", server.url()), true).await;
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
        mock.assert_async().await;
    }
}