1use std::time::Duration;
2
3use crate::entity::{Entity, FieldValue};
4use crate::parser::ParseError;
5use crate::relationship::Rel;
6
/// Upper bound on the number of distinct URLs accepted for a single run;
/// `collect_urls` records a `ParseError` when the deduplicated count exceeds it.
const MAX_URLS_PER_RUN: usize = 500;

/// Redirect limit handed to the HTTP client's redirect policy in `verify_urls`.
const MAX_REDIRECTS: usize = 5;

/// `User-Agent` value configured on the HTTP client built in `verify_urls`.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
15
/// Result of verifying a single URL.
#[derive(Debug)]
pub struct UrlCheck {
    /// The URL that was checked, as it was supplied.
    pub url: String,
    /// Overall verdict for this URL.
    pub status: CheckStatus,
    /// Optional human-readable explanation (for example `"timeout"` or
    /// `"HTTP 404 Not Found"`); `None` when the check succeeded.
    pub detail: Option<String>,
    /// Whether the URL was collected from a thumbnail field and therefore
    /// subject to the image content-type check.
    pub is_thumbnail: bool,
}
25
/// Severity of a URL verification result.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The URL responded successfully.
    Ok,
    /// The URL could not be fully confirmed (for example a timeout or an
    /// unfollowed redirect) but is not known to be broken.
    Warn,
    /// The URL is broken or failed a required check.
    Error,
}

impl std::fmt::Display for CheckStatus {
    /// Writes the lowercase label (`ok`, `warn`, `error`).
    ///
    /// The label goes through [`std::fmt::Formatter::pad`], so width, fill and
    /// alignment flags such as `{:<6}` are honoured when statuses are printed
    /// in aligned columns. Plain `{}` output is unchanged.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        f.pad(label)
    }
}
43
/// A URL queued for verification, tagged with whether it came from a
/// thumbnail field.
#[derive(Debug, Clone)]
pub struct UrlEntry {
    /// The URL exactly as it appeared in the input.
    url: String,
    /// `true` when the URL was taken from a thumbnail field.
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Borrows the URL as a string slice.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Reports whether this entry was collected from a thumbnail field.
    pub fn is_thumbnail(&self) -> bool {
        self.is_thumbnail
    }
}
62
63pub fn collect_urls(
65 sources: &[String],
66 entities: &[Entity],
67 rels: &[Rel],
68 errors: &mut Vec<ParseError>,
69) -> Vec<UrlEntry> {
70 let mut urls = Vec::new();
71
72 for url in sources {
74 urls.push(UrlEntry {
75 url: url.clone(),
76 is_thumbnail: false,
77 });
78 }
79
80 for entity in entities {
82 for (key, value) in &entity.fields {
83 match key.as_str() {
84 "thumbnail" | "thumbnail_source" => {
85 if let FieldValue::Single(url) = value
86 && !url.is_empty()
87 {
88 urls.push(UrlEntry {
89 url: url.clone(),
90 is_thumbnail: true,
91 });
92 }
93 }
94 "urls" => {
95 if let FieldValue::List(items) = value {
96 for url in items {
97 urls.push(UrlEntry {
98 url: url.clone(),
99 is_thumbnail: false,
100 });
101 }
102 }
103 }
104 _ => {}
105 }
106 }
107 }
108
109 for rel in rels {
111 for url in &rel.source_urls {
112 urls.push(UrlEntry {
113 url: url.clone(),
114 is_thumbnail: false,
115 });
116 }
117 }
118
119 let mut seen = std::collections::HashSet::new();
121 urls.retain(|entry| seen.insert(entry.url.clone()));
122
123 if urls.len() > MAX_URLS_PER_RUN {
125 errors.push(ParseError {
126 line: 0,
127 message: format!(
128 "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
129 urls.len()
130 ),
131 });
132 }
133
134 urls
135}
136
137pub async fn verify_urls(
139 urls: Vec<UrlEntry>,
140 concurrency: usize,
141 timeout_secs: u64,
142) -> Vec<UrlCheck> {
143 let client = reqwest::Client::builder()
144 .user_agent(USER_AGENT)
145 .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
146 .timeout(Duration::from_secs(timeout_secs))
147 .build()
148 .unwrap_or_else(|_| reqwest::Client::new());
149
150 let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
151 let client = std::sync::Arc::new(client);
152
153 let mut handles = Vec::new();
154
155 for entry in urls {
156 let sem = semaphore.clone();
157 let cli = client.clone();
158 handles.push(tokio::spawn(async move {
159 let _permit = sem.acquire().await;
160 check_url(&cli, &entry.url, entry.is_thumbnail).await
161 }));
162 }
163
164 let mut results = Vec::new();
165 for handle in handles {
166 match handle.await {
167 Ok(check) => results.push(check),
168 Err(e) => results.push(UrlCheck {
169 url: "unknown".into(),
170 status: CheckStatus::Error,
171 detail: Some(format!("task panicked: {e}")),
172 is_thumbnail: false,
173 }),
174 }
175 }
176
177 results
178}
179
180async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
181 match client.head(url).send().await {
183 Ok(resp) => {
184 let status = resp.status();
185
186 if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
188 return check_url_get(client, url, is_thumbnail).await;
189 }
190
191 evaluate_response(url, status, resp.headers(), is_thumbnail)
192 }
193 Err(e) => {
194 if e.is_timeout() {
195 UrlCheck {
196 url: url.to_string(),
197 status: CheckStatus::Warn,
198 detail: Some("timeout".into()),
199 is_thumbnail,
200 }
201 } else {
202 UrlCheck {
203 url: url.to_string(),
204 status: CheckStatus::Error,
205 detail: Some(format!("{e}")),
206 is_thumbnail,
207 }
208 }
209 }
210 }
211}
212
213async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
214 match client.get(url).send().await {
215 Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
216 Err(e) => {
217 if e.is_timeout() {
218 UrlCheck {
219 url: url.to_string(),
220 status: CheckStatus::Warn,
221 detail: Some("timeout".into()),
222 is_thumbnail,
223 }
224 } else {
225 UrlCheck {
226 url: url.to_string(),
227 status: CheckStatus::Error,
228 detail: Some(format!("{e}")),
229 is_thumbnail,
230 }
231 }
232 }
233 }
234}
235
236fn evaluate_response(
237 url: &str,
238 status: reqwest::StatusCode,
239 headers: &reqwest::header::HeaderMap,
240 is_thumbnail: bool,
241) -> UrlCheck {
242 if status.is_success() {
243 if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
245 let ct_str = ct.to_str().unwrap_or("");
246 if !ct_str.starts_with("image/") {
247 return UrlCheck {
248 url: url.to_string(),
249 status: CheckStatus::Error,
250 detail: Some(format!("expected content-type image/*, got {ct_str}")),
251 is_thumbnail,
252 };
253 }
254 }
255
256 UrlCheck {
257 url: url.to_string(),
258 status: CheckStatus::Ok,
259 detail: None,
260 is_thumbnail,
261 }
262 } else if status.is_redirection() {
263 UrlCheck {
265 url: url.to_string(),
266 status: CheckStatus::Warn,
267 detail: Some(format!("HTTP {status}")),
268 is_thumbnail,
269 }
270 } else {
271 UrlCheck {
272 url: url.to_string(),
273 status: CheckStatus::Error,
274 detail: Some(format!("HTTP {status}")),
275 is_thumbnail,
276 }
277 }
278}
279
// Unit tests for URL collection and response classification, plus
// integration-style tests that run `verify_urls` against a local mock server.
#[cfg(test)]
mod tests {
    use super::*;

    // A URL present both in `sources` and in an entity's `urls` list is kept once.
    #[test]
    fn collect_urls_deduplicates() {
        let sources = vec!["https://a.com".into(), "https://b.com".into()];
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Actor,
            fields: vec![(
                "urls".into(),
                FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
            )],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&sources, &entities, &[], &mut errors);
        assert!(errors.is_empty());
        // a.com, b.com, c.com
        assert_eq!(urls.len(), 3);
    }

    // A `thumbnail` field is collected and flagged as a thumbnail.
    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Actor,
            fields: vec![(
                "thumbnail".into(),
                FieldValue::Single("https://img.com/photo.jpg".into()),
            )],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &entities, &[], &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(urls[0].is_thumbnail);
    }

    // Relationship `source_urls` are collected as non-thumbnail entries.
    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "related_to".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &[], &rels, &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(!urls[0].is_thumbnail);
    }

    // 501 distinct URLs is one more than `MAX_URLS_PER_RUN`, so an error is recorded.
    #[test]
    fn collect_urls_boundary() {
        let sources: Vec<String> = (0..501)
            .map(|i| format!("https://example.com/{i}"))
            .collect();
        let mut errors = Vec::new();

        collect_urls(&sources, &[], &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("too many URLs")));
    }

    // A 200 response for a non-thumbnail URL is classified as Ok.
    #[test]
    fn evaluate_success() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // A 404 response is classified as Error.
    #[test]
    fn evaluate_not_found() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    // A thumbnail served as `text/html` is rejected with an `image/*` hint in the detail.
    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "text/html".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
    }

    // A thumbnail served as `image/jpeg` passes the content-type check.
    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "image/jpeg".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // End to end: a HEAD request answered with 200 yields a single Ok result.
    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/page", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End to end: a HEAD request answered with 404 yields an Error mentioning the status.
    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/missing", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(results[0].detail.as_deref().unwrap_or("").contains("404"));
        mock.assert_async().await;
    }

    // End to end: a 405 on HEAD triggers a GET retry, and the GET result decides the verdict.
    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head_mock = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get_mock = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/no-head", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        // Both the HEAD attempt and the GET fallback must have been issued.
        head_mock.assert_async().await;
        get_mock.assert_async().await;
    }

    // End to end: a thumbnail URL served with an image content type is Ok.
    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/img.jpg", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End to end: a thumbnail URL served as HTML is reported as an Error.
    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/not-image", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(
            results[0]
                .detail
                .as_deref()
                .unwrap_or("")
                .contains("image/*")
        );
        mock.assert_async().await;
    }
}