Skip to main content

weave_content/
verifier.rs

1use std::time::Duration;
2
3use crate::entity::{Entity, FieldValue};
4use crate::parser::ParseError;
5use crate::relationship::Rel;
6
/// Maximum total URLs per verify run.
///
/// `collect_urls` compares the number of distinct collected URLs against this
/// limit and records a `ParseError` when it is exceeded.
const MAX_URLS_PER_RUN: usize = 2_000;

/// Maximum redirect hops.
///
/// Passed to `reqwest::redirect::Policy::limited` when `verify_urls` builds
/// its HTTP client.
const MAX_REDIRECTS: usize = 5;

/// User-Agent header.
///
/// Configured on the HTTP client that `verify_urls` builds.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
15
/// Result of checking a single URL.
///
/// `verify_urls` returns one of these per spawned check.
#[derive(Debug)]
pub struct UrlCheck {
    /// The URL this result refers to.
    pub url: String,
    /// Severity of the outcome.
    pub status: CheckStatus,
    /// Explanation for a non-`Ok` outcome (e.g. `"timeout"` or `"HTTP 404 Not Found"`);
    /// `None` when the check succeeded.
    pub detail: Option<String>,
    /// Whether this URL is a thumbnail (needs content-type check).
    pub is_thumbnail: bool,
}
25
/// Outcome severity assigned to a URL check.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum CheckStatus {
    /// The check passed.
    Ok,
    /// The check was inconclusive, e.g. a timeout or an unfollowed redirect.
    Warn,
    /// The check failed.
    Error,
}

/// Renders the status as its lowercase label: `ok`, `warn` or `error`.
impl std::fmt::Display for CheckStatus {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        f.write_str(label)
    }
}
43
/// A URL queued for verification, tagged with whether it is a thumbnail.
#[derive(Clone, Debug)]
pub struct UrlEntry {
    /// The URL to check.
    url: String,
    /// `true` when the URL was collected from a thumbnail field.
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Returns the URL as a string slice.
    pub fn url(&self) -> &str {
        self.url.as_str()
    }

    /// Returns `true` when this entry is flagged as a thumbnail.
    pub fn is_thumbnail(&self) -> bool {
        self.is_thumbnail
    }
}
62
63/// Collect thumbnail URLs from registry entities (people/organizations).
64pub fn collect_registry_urls(reg: &crate::registry::EntityRegistry) -> Vec<UrlEntry> {
65    let mut urls = Vec::new();
66    let mut seen = std::collections::HashSet::new();
67
68    for name in reg.names() {
69        if let Some(entry) = reg.get_by_name(name) {
70            for (key, value) in &entry.entity.fields {
71                if matches!(key.as_str(), "thumbnail" | "thumbnail_source")
72                    && let FieldValue::Single(url) = value
73                    && !url.is_empty()
74                    && seen.insert(url.clone())
75                {
76                    urls.push(UrlEntry {
77                        url: url.clone(),
78                        is_thumbnail: true,
79                    });
80                }
81            }
82        }
83    }
84
85    urls
86}
87
88/// Collect all URLs from parsed case data for verification.
89pub fn collect_urls(
90    sources: &[crate::parser::SourceEntry],
91    entities: &[Entity],
92    rels: &[Rel],
93    errors: &mut Vec<ParseError>,
94) -> Vec<UrlEntry> {
95    let mut urls = Vec::new();
96
97    // Front matter sources
98    for source in sources {
99        urls.push(UrlEntry {
100            url: source.url().to_string(),
101            is_thumbnail: false,
102        });
103    }
104
105    // Entity URLs and thumbnails
106    for entity in entities {
107        for (key, value) in &entity.fields {
108            match key.as_str() {
109                "thumbnail" | "thumbnail_source" => {
110                    if let FieldValue::Single(url) = value
111                        && !url.is_empty()
112                    {
113                        urls.push(UrlEntry {
114                            url: url.clone(),
115                            is_thumbnail: true,
116                        });
117                    }
118                }
119                "urls" => {
120                    if let FieldValue::List(items) = value {
121                        for url in items {
122                            urls.push(UrlEntry {
123                                url: url.clone(),
124                                is_thumbnail: false,
125                            });
126                        }
127                    }
128                }
129                _ => {}
130            }
131        }
132    }
133
134    // Relationship source URL overrides
135    for rel in rels {
136        for url in &rel.source_urls {
137            urls.push(UrlEntry {
138                url: url.clone(),
139                is_thumbnail: false,
140            });
141        }
142    }
143
144    // Deduplicate by URL
145    let mut seen = std::collections::HashSet::new();
146    urls.retain(|entry| seen.insert(entry.url.clone()));
147
148    // Boundary check
149    if urls.len() > MAX_URLS_PER_RUN {
150        errors.push(ParseError {
151            line: 0,
152            message: format!(
153                "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
154                urls.len()
155            ),
156        });
157    }
158
159    urls
160}
161
162/// Verify all collected URLs concurrently.
163pub async fn verify_urls(
164    urls: Vec<UrlEntry>,
165    concurrency: usize,
166    timeout_secs: u64,
167) -> Vec<UrlCheck> {
168    let client = reqwest::Client::builder()
169        .user_agent(USER_AGENT)
170        .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
171        .timeout(Duration::from_secs(timeout_secs))
172        .build()
173        .unwrap_or_else(|_| reqwest::Client::new());
174
175    let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
176    let client = std::sync::Arc::new(client);
177
178    let mut handles = Vec::new();
179
180    for entry in urls {
181        let sem = semaphore.clone();
182        let cli = client.clone();
183        handles.push(tokio::spawn(async move {
184            let _permit = sem.acquire().await;
185            check_url(&cli, &entry.url, entry.is_thumbnail).await
186        }));
187    }
188
189    let mut results = Vec::new();
190    for handle in handles {
191        match handle.await {
192            Ok(check) => results.push(check),
193            Err(e) => results.push(UrlCheck {
194                url: "unknown".into(),
195                status: CheckStatus::Error,
196                detail: Some(format!("task panicked: {e}")),
197                is_thumbnail: false,
198            }),
199        }
200    }
201
202    results
203}
204
205async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
206    // Try HEAD first
207    match client.head(url).send().await {
208        Ok(resp) => {
209            let status = resp.status();
210
211            // If HEAD returns 405, try GET
212            if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
213                return check_url_get(client, url, is_thumbnail).await;
214            }
215
216            evaluate_response(url, status, resp.headers(), is_thumbnail)
217        }
218        Err(e) => {
219            if e.is_timeout() {
220                UrlCheck {
221                    url: url.to_string(),
222                    status: CheckStatus::Warn,
223                    detail: Some("timeout".into()),
224                    is_thumbnail,
225                }
226            } else {
227                UrlCheck {
228                    url: url.to_string(),
229                    status: CheckStatus::Error,
230                    detail: Some(format!("{e}")),
231                    is_thumbnail,
232                }
233            }
234        }
235    }
236}
237
238async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
239    match client.get(url).send().await {
240        Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
241        Err(e) => {
242            if e.is_timeout() {
243                UrlCheck {
244                    url: url.to_string(),
245                    status: CheckStatus::Warn,
246                    detail: Some("timeout".into()),
247                    is_thumbnail,
248                }
249            } else {
250                UrlCheck {
251                    url: url.to_string(),
252                    status: CheckStatus::Error,
253                    detail: Some(format!("{e}")),
254                    is_thumbnail,
255                }
256            }
257        }
258    }
259}
260
261fn evaluate_response(
262    url: &str,
263    status: reqwest::StatusCode,
264    headers: &reqwest::header::HeaderMap,
265    is_thumbnail: bool,
266) -> UrlCheck {
267    if status.is_success() {
268        // Check thumbnail content-type
269        if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
270            let ct_str = ct.to_str().unwrap_or("");
271            if !ct_str.starts_with("image/") {
272                return UrlCheck {
273                    url: url.to_string(),
274                    status: CheckStatus::Error,
275                    detail: Some(format!("expected content-type image/*, got {ct_str}")),
276                    is_thumbnail,
277                };
278            }
279        }
280
281        UrlCheck {
282            url: url.to_string(),
283            status: CheckStatus::Ok,
284            detail: None,
285            is_thumbnail,
286        }
287    } else if status.is_redirection() {
288        // Redirect not followed (should be handled by client policy)
289        UrlCheck {
290            url: url.to_string(),
291            status: CheckStatus::Warn,
292            detail: Some(format!("HTTP {status}")),
293            is_thumbnail,
294        }
295    } else {
296        UrlCheck {
297            url: url.to_string(),
298            status: CheckStatus::Error,
299            detail: Some(format!("HTTP {status}")),
300            is_thumbnail,
301        }
302    }
303}
304
// Unit tests for URL collection and response evaluation, plus async tests that
// run `verify_urls` against a local `mockito` server.
#[cfg(test)]
mod tests {
    use super::*;

    // `https://a.com` appears both as a source and in an entity's `urls` list;
    // it must be returned only once.
    #[test]
    fn collect_urls_deduplicates() {
        let sources = vec![
            crate::parser::SourceEntry::Url("https://a.com".into()),
            crate::parser::SourceEntry::Url("https://b.com".into()),
        ];
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Person,
            fields: vec![(
                "urls".into(),
                FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
            )],
            id: None,
            line: 1,
            tags: Vec::new(),
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&sources, &entities, &[], &mut errors);
        assert!(errors.is_empty());
        // a.com deduplicated
        assert_eq!(urls.len(), 3);
    }

    // A `thumbnail` field is collected and flagged as a thumbnail.
    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Person,
            fields: vec![(
                "thumbnail".into(),
                FieldValue::Single("https://img.com/photo.jpg".into()),
            )],
            id: None,
            line: 1,
            tags: Vec::new(),
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &entities, &[], &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(urls[0].is_thumbnail);
    }

    // Relationship `source_urls` are collected as non-thumbnail entries.
    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "associate_of".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &[], &rels, &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(!urls[0].is_thumbnail);
    }

    // 2,001 distinct URLs is one more than `MAX_URLS_PER_RUN`, so an error is recorded.
    #[test]
    fn collect_urls_boundary() {
        let sources: Vec<crate::parser::SourceEntry> = (0..2_001)
            .map(|i| crate::parser::SourceEntry::Url(format!("https://example.com/{i}")))
            .collect();
        let mut errors = Vec::new();

        collect_urls(&sources, &[], &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("too many URLs")));
    }

    // A 200 response for a non-thumbnail URL is `Ok`.
    #[test]
    fn evaluate_success() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // A 404 response is an `Error`.
    #[test]
    fn evaluate_not_found() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    // A thumbnail served as `text/html` is an `Error` whose detail mentions `image/*`.
    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "text/html".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
    }

    // A thumbnail served as `image/jpeg` is `Ok`.
    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "image/jpeg".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // End-to-end: a `HEAD` answered with 200 yields a single `Ok` result.
    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/page", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End-to-end: a `HEAD` answered with 404 yields an `Error` mentioning the status code.
    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/missing", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(results[0].detail.as_deref().unwrap_or("").contains("404"));
        mock.assert_async().await;
    }

    // End-to-end: a 405 on `HEAD` triggers a `GET`, and both mocks must be hit.
    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head_mock = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get_mock = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/no-head", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        head_mock.assert_async().await;
        get_mock.assert_async().await;
    }

    // End-to-end: a thumbnail served with `content-type: image/jpeg` is `Ok`.
    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/img.jpg", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End-to-end: a thumbnail served with `content-type: text/html` is an `Error`.
    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/not-image", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(
            results[0]
                .detail
                .as_deref()
                .unwrap_or("")
                .contains("image/*")
        );
        mock.assert_async().await;
    }
}
553}