// weave_content/verifier.rs

1use std::time::Duration;
2
3use crate::entity::{Entity, FieldValue};
4use crate::parser::ParseError;
5use crate::relationship::Rel;
6
/// Upper bound on the number of distinct URLs a single verify run accepts.
const MAX_URLS_PER_RUN: usize = 500;

/// How many redirect hops the HTTP client follows before giving up.
const MAX_REDIRECTS: usize = 5;

/// Value sent in the `User-Agent` header of every verification request.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
15
16/// Result of checking a single URL.
17#[derive(Debug)]
18pub struct UrlCheck {
19    pub url: String,
20    pub status: CheckStatus,
21    pub detail: Option<String>,
22    /// Whether this URL is a thumbnail (needs content-type check).
23    pub is_thumbnail: bool,
24}
25
/// How serious the result of a URL check is.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The URL passed verification.
    Ok,
    /// The check was inconclusive or suspicious, but not a definite failure.
    Warn,
    /// The URL failed verification.
    Error,
}

/// Renders the status as its lowercase label: `ok`, `warn` or `error`.
impl std::fmt::Display for CheckStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        f.write_str(label)
    }
}
43
/// A URL queued for verification, tagged with whether it is a thumbnail.
#[derive(Debug, Clone)]
pub struct UrlEntry {
    url: String,
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Returns the URL to verify.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Returns `true` when the URL was collected from a thumbnail field.
    pub fn is_thumbnail(&self) -> bool {
        self.is_thumbnail
    }
}
62
63/// Collect thumbnail URLs from registry entities (actors/institutions).
64pub fn collect_registry_urls(reg: &crate::registry::EntityRegistry) -> Vec<UrlEntry> {
65    let mut urls = Vec::new();
66    let mut seen = std::collections::HashSet::new();
67
68    for name in reg.names() {
69        if let Some(entry) = reg.get_by_name(name) {
70            for (key, value) in &entry.entity.fields {
71                if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
72                    && let FieldValue::Single(url) = value
73                    && !url.is_empty()
74                    && seen.insert(url.clone())
75                {
76                    urls.push(UrlEntry {
77                        url: url.clone(),
78                        is_thumbnail: true,
79                    });
80                }
81            }
82        }
83    }
84
85    urls
86}
87
88/// Collect all URLs from parsed case data for verification.
89pub fn collect_urls(
90    sources: &[String],
91    entities: &[Entity],
92    rels: &[Rel],
93    errors: &mut Vec<ParseError>,
94) -> Vec<UrlEntry> {
95    let mut urls = Vec::new();
96
97    // Front matter sources
98    for url in sources {
99        urls.push(UrlEntry {
100            url: url.clone(),
101            is_thumbnail: false,
102        });
103    }
104
105    // Entity URLs and thumbnails
106    for entity in entities {
107        for (key, value) in &entity.fields {
108            match key.as_str() {
109                "thumbnail" | "thumbnail_source" => {
110                    if let FieldValue::Single(url) = value
111                        && !url.is_empty()
112                    {
113                        urls.push(UrlEntry {
114                            url: url.clone(),
115                            is_thumbnail: true,
116                        });
117                    }
118                }
119                "urls" => {
120                    if let FieldValue::List(items) = value {
121                        for url in items {
122                            urls.push(UrlEntry {
123                                url: url.clone(),
124                                is_thumbnail: false,
125                            });
126                        }
127                    }
128                }
129                _ => {}
130            }
131        }
132    }
133
134    // Relationship source URL overrides
135    for rel in rels {
136        for url in &rel.source_urls {
137            urls.push(UrlEntry {
138                url: url.clone(),
139                is_thumbnail: false,
140            });
141        }
142    }
143
144    // Deduplicate by URL
145    let mut seen = std::collections::HashSet::new();
146    urls.retain(|entry| seen.insert(entry.url.clone()));
147
148    // Boundary check
149    if urls.len() > MAX_URLS_PER_RUN {
150        errors.push(ParseError {
151            line: 0,
152            message: format!(
153                "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
154                urls.len()
155            ),
156        });
157    }
158
159    urls
160}
161
162/// Verify all collected URLs concurrently.
163pub async fn verify_urls(
164    urls: Vec<UrlEntry>,
165    concurrency: usize,
166    timeout_secs: u64,
167) -> Vec<UrlCheck> {
168    let client = reqwest::Client::builder()
169        .user_agent(USER_AGENT)
170        .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
171        .timeout(Duration::from_secs(timeout_secs))
172        .build()
173        .unwrap_or_else(|_| reqwest::Client::new());
174
175    let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
176    let client = std::sync::Arc::new(client);
177
178    let mut handles = Vec::new();
179
180    for entry in urls {
181        let sem = semaphore.clone();
182        let cli = client.clone();
183        handles.push(tokio::spawn(async move {
184            let _permit = sem.acquire().await;
185            check_url(&cli, &entry.url, entry.is_thumbnail).await
186        }));
187    }
188
189    let mut results = Vec::new();
190    for handle in handles {
191        match handle.await {
192            Ok(check) => results.push(check),
193            Err(e) => results.push(UrlCheck {
194                url: "unknown".into(),
195                status: CheckStatus::Error,
196                detail: Some(format!("task panicked: {e}")),
197                is_thumbnail: false,
198            }),
199        }
200    }
201
202    results
203}
204
205async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
206    // Try HEAD first
207    match client.head(url).send().await {
208        Ok(resp) => {
209            let status = resp.status();
210
211            // If HEAD returns 405, try GET
212            if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
213                return check_url_get(client, url, is_thumbnail).await;
214            }
215
216            evaluate_response(url, status, resp.headers(), is_thumbnail)
217        }
218        Err(e) => {
219            if e.is_timeout() {
220                UrlCheck {
221                    url: url.to_string(),
222                    status: CheckStatus::Warn,
223                    detail: Some("timeout".into()),
224                    is_thumbnail,
225                }
226            } else {
227                UrlCheck {
228                    url: url.to_string(),
229                    status: CheckStatus::Error,
230                    detail: Some(format!("{e}")),
231                    is_thumbnail,
232                }
233            }
234        }
235    }
236}
237
238async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
239    match client.get(url).send().await {
240        Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
241        Err(e) => {
242            if e.is_timeout() {
243                UrlCheck {
244                    url: url.to_string(),
245                    status: CheckStatus::Warn,
246                    detail: Some("timeout".into()),
247                    is_thumbnail,
248                }
249            } else {
250                UrlCheck {
251                    url: url.to_string(),
252                    status: CheckStatus::Error,
253                    detail: Some(format!("{e}")),
254                    is_thumbnail,
255                }
256            }
257        }
258    }
259}
260
261fn evaluate_response(
262    url: &str,
263    status: reqwest::StatusCode,
264    headers: &reqwest::header::HeaderMap,
265    is_thumbnail: bool,
266) -> UrlCheck {
267    if status.is_success() {
268        // Check thumbnail content-type
269        if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
270            let ct_str = ct.to_str().unwrap_or("");
271            if !ct_str.starts_with("image/") {
272                return UrlCheck {
273                    url: url.to_string(),
274                    status: CheckStatus::Error,
275                    detail: Some(format!("expected content-type image/*, got {ct_str}")),
276                    is_thumbnail,
277                };
278            }
279        }
280
281        UrlCheck {
282            url: url.to_string(),
283            status: CheckStatus::Ok,
284            detail: None,
285            is_thumbnail,
286        }
287    } else if status.is_redirection() {
288        // Redirect not followed (should be handled by client policy)
289        UrlCheck {
290            url: url.to_string(),
291            status: CheckStatus::Warn,
292            detail: Some(format!("HTTP {status}")),
293            is_thumbnail,
294        }
295    } else {
296        UrlCheck {
297            url: url.to_string(),
298            status: CheckStatus::Error,
299            detail: Some(format!("HTTP {status}")),
300            is_thumbnail,
301        }
302    }
303}
304
#[cfg(test)]
mod tests {
    use super::*;

    /// Actor entity named "Test" on line 1 that carries exactly one field.
    fn actor_with_field(key: &'static str, value: FieldValue) -> Entity {
        Entity {
            name: "Test".into(),
            label: crate::entity::Label::Actor,
            fields: vec![(key.into(), value)],
            id: None,
            line: 1,
        }
    }

    /// One-element work list for `verify_urls`.
    fn single_entry(url: String, is_thumbnail: bool) -> Vec<UrlEntry> {
        vec![UrlEntry { url, is_thumbnail }]
    }

    /// Header map holding only the given `Content-Type` value.
    fn content_type_headers(value: &str) -> reqwest::header::HeaderMap {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            value.parse().unwrap_or_else(|_| unreachable!()),
        );
        headers
    }

    /// Detail text of a check, or the empty string when there is none.
    fn detail_text(check: &UrlCheck) -> &str {
        check.detail.as_deref().unwrap_or("")
    }

    #[test]
    fn collect_urls_deduplicates() {
        let sources: Vec<String> = vec!["https://a.com".into(), "https://b.com".into()];
        let entities = vec![actor_with_field(
            "urls",
            FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
        )];
        let mut errors = Vec::new();

        let collected = collect_urls(&sources, &entities, &[], &mut errors);

        assert!(errors.is_empty());
        // a.com appears in both the sources and the entity, but is kept once.
        assert_eq!(collected.len(), 3);
    }

    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![actor_with_field(
            "thumbnail",
            FieldValue::Single("https://img.com/photo.jpg".into()),
        )];
        let mut errors = Vec::new();

        let collected = collect_urls(&[], &entities, &[], &mut errors);

        assert_eq!(collected.len(), 1);
        assert!(collected[0].is_thumbnail());
    }

    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "related_to".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let collected = collect_urls(&[], &[], &rels, &mut errors);

        assert_eq!(collected.len(), 1);
        assert!(!collected[0].is_thumbnail());
    }

    #[test]
    fn collect_urls_boundary() {
        // One URL more than the per-run limit.
        let sources: Vec<String> = (0..501)
            .map(|i| format!("https://example.com/{i}"))
            .collect();
        let mut errors = Vec::new();

        collect_urls(&sources, &[], &[], &mut errors);

        let reported = errors
            .iter()
            .any(|err| err.message.contains("too many URLs"));
        assert!(reported);
    }

    #[test]
    fn evaluate_success() {
        let no_headers = reqwest::header::HeaderMap::new();
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &no_headers,
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[test]
    fn evaluate_not_found() {
        let no_headers = reqwest::header::HeaderMap::new();
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &no_headers,
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let headers = content_type_headers("text/html");
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(detail_text(&check).contains("image/*"));
    }

    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let headers = content_type_headers("image/jpeg");
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let head = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;

        let work = single_entry(format!("{}/page", server.url()), false);
        let results = verify_urls(work, 4, 5).await;

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        head.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let head = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;

        let work = single_entry(format!("{}/missing", server.url()), false);
        let results = verify_urls(work, 4, 5).await;

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(detail_text(&results[0]).contains("404"));
        head.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;

        let work = single_entry(format!("{}/no-head", server.url()), false);
        let results = verify_urls(work, 4, 5).await;

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        head.assert_async().await;
        get.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let head = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;

        let work = single_entry(format!("{}/img.jpg", server.url()), true);
        let results = verify_urls(work, 4, 5).await;

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        head.assert_async().await;
    }

    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let head = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;

        let work = single_entry(format!("{}/not-image", server.url()), true);
        let results = verify_urls(work, 4, 5).await;

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(detail_text(&results[0]).contains("image/*"));
        head.assert_async().await;
    }
}