Skip to main content

weave_content/
verifier.rs

1use std::time::Duration;
2
3use crate::entity::{Entity, FieldValue};
4use crate::parser::ParseError;
5use crate::relationship::Rel;
6
/// Maximum total URLs per verify run.
///
/// `collect_urls` records a `ParseError` when the deduplicated URL count
/// exceeds this value.
const MAX_URLS_PER_RUN: usize = 2_000;

/// Maximum redirect hops.
///
/// Passed to `reqwest::redirect::Policy::limited` when `verify_urls` builds
/// its HTTP client.
const MAX_REDIRECTS: usize = 5;

/// User-Agent header.
///
/// Configured on the HTTP client built in `verify_urls`.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
15
/// Result of checking a single URL.
#[derive(Debug)]
pub struct UrlCheck {
    /// The URL this result refers to.
    pub url: String,
    /// Severity of the outcome.
    pub status: CheckStatus,
    /// Explanation attached to warnings and errors (for example `"timeout"`
    /// or `"HTTP <status>"`); `None` when the check passed.
    pub detail: Option<String>,
    /// Whether this URL is a thumbnail (needs content-type check).
    pub is_thumbnail: bool,
}
25
/// Severity attached to the outcome of a single URL check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The check passed.
    Ok,
    /// The check did not pass cleanly but is not treated as a failure.
    Warn,
    /// The check failed.
    Error,
}

/// Renders the status as its lowercase label: `ok`, `warn` or `error`.
impl std::fmt::Display for CheckStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        // Written verbatim, so width/fill flags on the formatter are not applied.
        f.write_str(label)
    }
}
43
/// A URL queued for verification, tagged with whether it is a thumbnail.
#[derive(Debug, Clone)]
pub struct UrlEntry {
    url: String,
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Borrows the URL text.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Reports whether the URL was collected from a thumbnail field.
    pub fn is_thumbnail(&self) -> bool {
        let Self { is_thumbnail, .. } = self;
        *is_thumbnail
    }
}
62
63/// Collect thumbnail URLs from registry entities (people/organizations).
64pub fn collect_registry_urls(reg: &crate::registry::EntityRegistry) -> Vec<UrlEntry> {
65    let mut urls = Vec::new();
66    let mut seen = std::collections::HashSet::new();
67
68    for name in reg.names() {
69        if let Some(entry) = reg.get_by_name(name) {
70            for (key, value) in &entry.entity.fields {
71                if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
72                    && let FieldValue::Single(url) = value
73                    && !url.is_empty()
74                    && seen.insert(url.clone())
75                {
76                    urls.push(UrlEntry {
77                        url: url.clone(),
78                        is_thumbnail: true,
79                    });
80                }
81            }
82        }
83    }
84
85    urls
86}
87
88/// Collect all URLs from parsed case data for verification.
89pub fn collect_urls(
90    sources: &[crate::parser::SourceEntry],
91    entities: &[Entity],
92    rels: &[Rel],
93    errors: &mut Vec<ParseError>,
94) -> Vec<UrlEntry> {
95    let mut urls = Vec::new();
96
97    // Front matter sources
98    for source in sources {
99        urls.push(UrlEntry {
100            url: source.url().to_string(),
101            is_thumbnail: false,
102        });
103    }
104
105    // Entity URLs and thumbnails
106    for entity in entities {
107        for (key, value) in &entity.fields {
108            match key.as_str() {
109                "thumbnail" | "thumbnail_source" => {
110                    if let FieldValue::Single(url) = value
111                        && !url.is_empty()
112                    {
113                        urls.push(UrlEntry {
114                            url: url.clone(),
115                            is_thumbnail: true,
116                        });
117                    }
118                }
119                "urls" => {
120                    if let FieldValue::List(items) = value {
121                        for url in items {
122                            urls.push(UrlEntry {
123                                url: url.clone(),
124                                is_thumbnail: false,
125                            });
126                        }
127                    }
128                }
129                _ => {}
130            }
131        }
132    }
133
134    // Relationship source URL overrides
135    for rel in rels {
136        for url in &rel.source_urls {
137            urls.push(UrlEntry {
138                url: url.clone(),
139                is_thumbnail: false,
140            });
141        }
142    }
143
144    // Deduplicate by URL
145    let mut seen = std::collections::HashSet::new();
146    urls.retain(|entry| seen.insert(entry.url.clone()));
147
148    // Boundary check
149    if urls.len() > MAX_URLS_PER_RUN {
150        errors.push(ParseError {
151            line: 0,
152            message: format!(
153                "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
154                urls.len()
155            ),
156        });
157    }
158
159    urls
160}
161
162/// Verify all collected URLs concurrently.
163pub async fn verify_urls(
164    urls: Vec<UrlEntry>,
165    concurrency: usize,
166    timeout_secs: u64,
167) -> Vec<UrlCheck> {
168    let client = reqwest::Client::builder()
169        .user_agent(USER_AGENT)
170        .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
171        .timeout(Duration::from_secs(timeout_secs))
172        .build()
173        .unwrap_or_else(|_| reqwest::Client::new());
174
175    let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
176    let client = std::sync::Arc::new(client);
177
178    let mut handles = Vec::new();
179
180    for entry in urls {
181        let sem = semaphore.clone();
182        let cli = client.clone();
183        handles.push(tokio::spawn(async move {
184            let _permit = sem.acquire().await;
185            check_url(&cli, &entry.url, entry.is_thumbnail).await
186        }));
187    }
188
189    let mut results = Vec::new();
190    for handle in handles {
191        match handle.await {
192            Ok(check) => results.push(check),
193            Err(e) => results.push(UrlCheck {
194                url: "unknown".into(),
195                status: CheckStatus::Error,
196                detail: Some(format!("task panicked: {e}")),
197                is_thumbnail: false,
198            }),
199        }
200    }
201
202    results
203}
204
205async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
206    // Try HEAD first
207    match client.head(url).send().await {
208        Ok(resp) => {
209            let status = resp.status();
210
211            // If HEAD returns 405, try GET
212            if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
213                return check_url_get(client, url, is_thumbnail).await;
214            }
215
216            evaluate_response(url, status, resp.headers(), is_thumbnail)
217        }
218        Err(e) => {
219            if e.is_timeout() {
220                UrlCheck {
221                    url: url.to_string(),
222                    status: CheckStatus::Warn,
223                    detail: Some("timeout".into()),
224                    is_thumbnail,
225                }
226            } else {
227                UrlCheck {
228                    url: url.to_string(),
229                    status: CheckStatus::Error,
230                    detail: Some(format!("{e}")),
231                    is_thumbnail,
232                }
233            }
234        }
235    }
236}
237
238async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
239    match client.get(url).send().await {
240        Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
241        Err(e) => {
242            if e.is_timeout() {
243                UrlCheck {
244                    url: url.to_string(),
245                    status: CheckStatus::Warn,
246                    detail: Some("timeout".into()),
247                    is_thumbnail,
248                }
249            } else {
250                UrlCheck {
251                    url: url.to_string(),
252                    status: CheckStatus::Error,
253                    detail: Some(format!("{e}")),
254                    is_thumbnail,
255                }
256            }
257        }
258    }
259}
260
261fn evaluate_response(
262    url: &str,
263    status: reqwest::StatusCode,
264    headers: &reqwest::header::HeaderMap,
265    is_thumbnail: bool,
266) -> UrlCheck {
267    if status.is_success() {
268        // Check thumbnail content-type
269        if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
270            let ct_str = ct.to_str().unwrap_or("");
271            if !ct_str.starts_with("image/") {
272                return UrlCheck {
273                    url: url.to_string(),
274                    status: CheckStatus::Error,
275                    detail: Some(format!("expected content-type image/*, got {ct_str}")),
276                    is_thumbnail,
277                };
278            }
279        }
280
281        UrlCheck {
282            url: url.to_string(),
283            status: CheckStatus::Ok,
284            detail: None,
285            is_thumbnail,
286        }
287    } else if status.is_redirection() {
288        // Redirect not followed (should be handled by client policy)
289        UrlCheck {
290            url: url.to_string(),
291            status: CheckStatus::Warn,
292            detail: Some(format!("HTTP {status}")),
293            is_thumbnail,
294        }
295    } else {
296        UrlCheck {
297            url: url.to_string(),
298            status: CheckStatus::Error,
299            detail: Some(format!("HTTP {status}")),
300            is_thumbnail,
301        }
302    }
303}
304
// Unit tests for URL collection and response classification, plus async tests
// that run `verify_urls` against a local `mockito` server.
#[cfg(test)]
mod tests {
    use super::*;

    // The same URL listed as a source and in an entity's `urls` field is
    // returned only once.
    #[test]
    fn collect_urls_deduplicates() {
        let sources = vec![
            crate::parser::SourceEntry::Url("https://a.com".into()),
            crate::parser::SourceEntry::Url("https://b.com".into()),
        ];
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Person,
            fields: vec![(
                "urls".into(),
                FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
            )],
            id: None,
            line: 1,
            tags: Vec::new(),
            slug: None,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&sources, &entities, &[], &mut errors);
        assert!(errors.is_empty());
        // a.com deduplicated
        assert_eq!(urls.len(), 3);
    }

    // A `thumbnail` field is collected and flagged as a thumbnail.
    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Person,
            fields: vec![(
                "thumbnail".into(),
                FieldValue::Single("https://img.com/photo.jpg".into()),
            )],
            id: None,
            line: 1,
            tags: Vec::new(),
            slug: None,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &entities, &[], &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(urls[0].is_thumbnail);
    }

    // Relationship `source_urls` are collected as non-thumbnail entries.
    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "associate_of".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &[], &rels, &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(!urls[0].is_thumbnail);
    }

    // One URL more than `MAX_URLS_PER_RUN` produces a "too many URLs" error.
    #[test]
    fn collect_urls_boundary() {
        let sources: Vec<crate::parser::SourceEntry> = (0..2_001)
            .map(|i| crate::parser::SourceEntry::Url(format!("https://example.com/{i}")))
            .collect();
        let mut errors = Vec::new();

        collect_urls(&sources, &[], &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("too many URLs")));
    }

    // A 200 response for a non-thumbnail URL is `Ok`.
    #[test]
    fn evaluate_success() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // A 404 response is an `Error`.
    #[test]
    fn evaluate_not_found() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    // A thumbnail served with a non-image content type is an `Error` whose
    // detail mentions the expected `image/*` type.
    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "text/html".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
    }

    // A thumbnail served as `image/jpeg` is `Ok`.
    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "image/jpeg".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // End to end: a HEAD request answered with 200 yields `Ok`.
    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/page", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End to end: a HEAD request answered with 404 yields an `Error` whose
    // detail contains the status code.
    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/missing", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(results[0].detail.as_deref().unwrap_or("").contains("404"));
        mock.assert_async().await;
    }

    // End to end: a 405 answer to HEAD makes the checker retry with GET, and
    // both requests must have been received by the server.
    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head_mock = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get_mock = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/no-head", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        head_mock.assert_async().await;
        get_mock.assert_async().await;
    }

    // End to end: a thumbnail served with an image content type is `Ok`.
    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/img.jpg", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End to end: a thumbnail served as `text/html` is an `Error` whose detail
    // mentions the expected `image/*` type.
    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/not-image", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(
            results[0]
                .detail
                .as_deref()
                .unwrap_or("")
                .contains("image/*")
        );
        mock.assert_async().await;
    }
}
555}