//! weave_content/verifier.rs

1use std::time::Duration;
2
3use crate::entity::{Entity, FieldValue};
4use crate::parser::ParseError;
5use crate::relationship::Rel;
6
/// Maximum total URLs per verify run.
///
/// `collect_urls` compares the deduplicated URL count against this limit and
/// records a `ParseError` when the count is larger.
const MAX_URLS_PER_RUN: usize = 500;

/// Maximum redirect hops.
///
/// Passed to `reqwest::redirect::Policy::limited` when `verify_urls` builds
/// its HTTP client.
const MAX_REDIRECTS: usize = 5;

/// User-Agent header.
///
/// Set on the HTTP client builder in `verify_urls`.
const USER_AGENT: &str = "weave-content/0.2 (+https://github.com/redberrythread/weave)";
15
16/// Result of checking a single URL.
17#[derive(Debug)]
18pub struct UrlCheck {
19    pub url: String,
20    pub status: CheckStatus,
21    pub detail: Option<String>,
22    /// Whether this URL is a thumbnail (needs content-type check).
23    pub is_thumbnail: bool,
24}
25
/// Outcome severity assigned to a URL check.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CheckStatus {
    /// The check passed.
    Ok,
    /// The check was inconclusive (used for timeouts and unfollowed redirects).
    Warn,
    /// The check failed.
    Error,
}

impl std::fmt::Display for CheckStatus {
    /// Writes the lowercase label of the variant: `ok`, `warn` or `error`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Ok => "ok",
            Self::Warn => "warn",
            Self::Error => "error",
        };
        f.write_str(label)
    }
}
43
/// A URL queued for verification, tagged with whether it is a thumbnail.
///
/// Only the URL and the thumbnail flag are stored. The fields are private;
/// read them through [`UrlEntry::url`] and [`UrlEntry::is_thumbnail`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct UrlEntry {
    url: String,
    is_thumbnail: bool,
}

impl UrlEntry {
    /// Returns the URL to verify.
    #[must_use]
    pub fn url(&self) -> &str {
        &self.url
    }

    /// Returns `true` when the URL was collected as a thumbnail.
    #[must_use]
    pub fn is_thumbnail(&self) -> bool {
        self.is_thumbnail
    }
}
62
63/// Collect all URLs from parsed case data for verification.
64pub fn collect_urls(
65    sources: &[String],
66    entities: &[Entity],
67    rels: &[Rel],
68    errors: &mut Vec<ParseError>,
69) -> Vec<UrlEntry> {
70    let mut urls = Vec::new();
71
72    // Front matter sources
73    for url in sources {
74        urls.push(UrlEntry {
75            url: url.clone(),
76            is_thumbnail: false,
77        });
78    }
79
80    // Entity URLs and thumbnails
81    for entity in entities {
82        for (key, value) in &entity.fields {
83            match key.as_str() {
84                "thumbnail" | "thumbnail_source" => {
85                    if let FieldValue::Single(url) = value
86                        && !url.is_empty()
87                    {
88                        urls.push(UrlEntry {
89                            url: url.clone(),
90                            is_thumbnail: true,
91                        });
92                    }
93                }
94                "urls" => {
95                    if let FieldValue::List(items) = value {
96                        for url in items {
97                            urls.push(UrlEntry {
98                                url: url.clone(),
99                                is_thumbnail: false,
100                            });
101                        }
102                    }
103                }
104                _ => {}
105            }
106        }
107    }
108
109    // Relationship source URL overrides
110    for rel in rels {
111        for url in &rel.source_urls {
112            urls.push(UrlEntry {
113                url: url.clone(),
114                is_thumbnail: false,
115            });
116        }
117    }
118
119    // Deduplicate by URL
120    let mut seen = std::collections::HashSet::new();
121    urls.retain(|entry| seen.insert(entry.url.clone()));
122
123    // Boundary check
124    if urls.len() > MAX_URLS_PER_RUN {
125        errors.push(ParseError {
126            line: 0,
127            message: format!(
128                "too many URLs to verify (max {MAX_URLS_PER_RUN}, got {})",
129                urls.len()
130            ),
131        });
132    }
133
134    urls
135}
136
137/// Verify all collected URLs concurrently.
138pub async fn verify_urls(
139    urls: Vec<UrlEntry>,
140    concurrency: usize,
141    timeout_secs: u64,
142) -> Vec<UrlCheck> {
143    let client = reqwest::Client::builder()
144        .user_agent(USER_AGENT)
145        .redirect(reqwest::redirect::Policy::limited(MAX_REDIRECTS))
146        .timeout(Duration::from_secs(timeout_secs))
147        .build()
148        .unwrap_or_else(|_| reqwest::Client::new());
149
150    let semaphore = std::sync::Arc::new(tokio::sync::Semaphore::new(concurrency));
151    let client = std::sync::Arc::new(client);
152
153    let mut handles = Vec::new();
154
155    for entry in urls {
156        let sem = semaphore.clone();
157        let cli = client.clone();
158        handles.push(tokio::spawn(async move {
159            let _permit = sem.acquire().await;
160            check_url(&cli, &entry.url, entry.is_thumbnail).await
161        }));
162    }
163
164    let mut results = Vec::new();
165    for handle in handles {
166        match handle.await {
167            Ok(check) => results.push(check),
168            Err(e) => results.push(UrlCheck {
169                url: "unknown".into(),
170                status: CheckStatus::Error,
171                detail: Some(format!("task panicked: {e}")),
172                is_thumbnail: false,
173            }),
174        }
175    }
176
177    results
178}
179
180async fn check_url(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
181    // Try HEAD first
182    match client.head(url).send().await {
183        Ok(resp) => {
184            let status = resp.status();
185
186            // If HEAD returns 405, try GET
187            if status == reqwest::StatusCode::METHOD_NOT_ALLOWED {
188                return check_url_get(client, url, is_thumbnail).await;
189            }
190
191            evaluate_response(url, status, resp.headers(), is_thumbnail)
192        }
193        Err(e) => {
194            if e.is_timeout() {
195                UrlCheck {
196                    url: url.to_string(),
197                    status: CheckStatus::Warn,
198                    detail: Some("timeout".into()),
199                    is_thumbnail,
200                }
201            } else {
202                UrlCheck {
203                    url: url.to_string(),
204                    status: CheckStatus::Error,
205                    detail: Some(format!("{e}")),
206                    is_thumbnail,
207                }
208            }
209        }
210    }
211}
212
213async fn check_url_get(client: &reqwest::Client, url: &str, is_thumbnail: bool) -> UrlCheck {
214    match client.get(url).send().await {
215        Ok(resp) => evaluate_response(url, resp.status(), resp.headers(), is_thumbnail),
216        Err(e) => {
217            if e.is_timeout() {
218                UrlCheck {
219                    url: url.to_string(),
220                    status: CheckStatus::Warn,
221                    detail: Some("timeout".into()),
222                    is_thumbnail,
223                }
224            } else {
225                UrlCheck {
226                    url: url.to_string(),
227                    status: CheckStatus::Error,
228                    detail: Some(format!("{e}")),
229                    is_thumbnail,
230                }
231            }
232        }
233    }
234}
235
236fn evaluate_response(
237    url: &str,
238    status: reqwest::StatusCode,
239    headers: &reqwest::header::HeaderMap,
240    is_thumbnail: bool,
241) -> UrlCheck {
242    if status.is_success() {
243        // Check thumbnail content-type
244        if is_thumbnail && let Some(ct) = headers.get(reqwest::header::CONTENT_TYPE) {
245            let ct_str = ct.to_str().unwrap_or("");
246            if !ct_str.starts_with("image/") {
247                return UrlCheck {
248                    url: url.to_string(),
249                    status: CheckStatus::Error,
250                    detail: Some(format!("expected content-type image/*, got {ct_str}")),
251                    is_thumbnail,
252                };
253            }
254        }
255
256        UrlCheck {
257            url: url.to_string(),
258            status: CheckStatus::Ok,
259            detail: None,
260            is_thumbnail,
261        }
262    } else if status.is_redirection() {
263        // Redirect not followed (should be handled by client policy)
264        UrlCheck {
265            url: url.to_string(),
266            status: CheckStatus::Warn,
267            detail: Some(format!("HTTP {status}")),
268            is_thumbnail,
269        }
270    } else {
271        UrlCheck {
272            url: url.to_string(),
273            status: CheckStatus::Error,
274            detail: Some(format!("HTTP {status}")),
275            is_thumbnail,
276        }
277    }
278}
279
// Unit tests for URL collection and response evaluation, plus end-to-end
// checks of `verify_urls` against a local mockito HTTP server.
#[cfg(test)]
mod tests {
    use super::*;

    // A URL present both in the front-matter sources and in an entity's
    // `urls` list must appear only once in the collected result.
    #[test]
    fn collect_urls_deduplicates() {
        let sources = vec!["https://a.com".into(), "https://b.com".into()];
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Actor,
            fields: vec![(
                "urls".into(),
                FieldValue::List(vec!["https://a.com".into(), "https://c.com".into()]),
            )],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&sources, &entities, &[], &mut errors);
        assert!(errors.is_empty());
        // a.com deduplicated
        assert_eq!(urls.len(), 3);
    }

    // A single-valued `thumbnail` field is collected and flagged as a
    // thumbnail.
    #[test]
    fn collect_urls_includes_thumbnails() {
        let entities = vec![Entity {
            name: "Test".into(),
            label: crate::entity::Label::Actor,
            fields: vec![(
                "thumbnail".into(),
                FieldValue::Single("https://img.com/photo.jpg".into()),
            )],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &entities, &[], &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(urls[0].is_thumbnail);
    }

    // Relationship `source_urls` are collected as non-thumbnail entries.
    #[test]
    fn collect_urls_includes_rel_sources() {
        let rels = vec![Rel {
            source_name: "A".into(),
            target_name: "B".into(),
            rel_type: "related_to".into(),
            source_urls: vec!["https://src.com".into()],
            fields: vec![],
            id: None,
            line: 1,
        }];
        let mut errors = Vec::new();

        let urls = collect_urls(&[], &[], &rels, &mut errors);
        assert_eq!(urls.len(), 1);
        assert!(!urls[0].is_thumbnail);
    }

    // 501 distinct URLs is one more than MAX_URLS_PER_RUN (500), so an error
    // must be recorded.
    #[test]
    fn collect_urls_boundary() {
        let sources: Vec<String> = (0..501)
            .map(|i| format!("https://example.com/{i}"))
            .collect();
        let mut errors = Vec::new();

        collect_urls(&sources, &[], &[], &mut errors);
        assert!(errors.iter().any(|e| e.message.contains("too many URLs")));
    }

    // 200 with no headers and no thumbnail flag evaluates to Ok.
    #[test]
    fn evaluate_success() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::OK,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // 404 evaluates to Error.
    #[test]
    fn evaluate_not_found() {
        let check = evaluate_response(
            "https://example.com",
            reqwest::StatusCode::NOT_FOUND,
            &reqwest::header::HeaderMap::new(),
            false,
        );
        assert_eq!(check.status, CheckStatus::Error);
    }

    // A thumbnail served as `text/html` is an Error whose detail mentions the
    // expected `image/*` type.
    #[test]
    fn evaluate_thumbnail_wrong_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "text/html".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Error);
        assert!(check.detail.as_deref().unwrap_or("").contains("image/*"));
    }

    // A thumbnail served as `image/jpeg` is Ok.
    #[test]
    fn evaluate_thumbnail_correct_content_type() {
        let mut headers = reqwest::header::HeaderMap::new();
        headers.insert(
            reqwest::header::CONTENT_TYPE,
            "image/jpeg".parse().unwrap_or_else(|_| unreachable!()),
        );
        let check = evaluate_response(
            "https://example.com/img.jpg",
            reqwest::StatusCode::OK,
            &headers,
            true,
        );
        assert_eq!(check.status, CheckStatus::Ok);
    }

    // End to end: the mock server answers HEAD with 200, so the result is Ok
    // and the HEAD mock must have been hit.
    #[tokio::test]
    async fn verify_urls_with_mock_server_ok() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/page")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/page", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End to end: a 404 answer to HEAD yields an Error whose detail contains
    // the status code.
    #[tokio::test]
    async fn verify_urls_with_mock_server_404() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/missing")
            .with_status(404)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/missing", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(results[0].detail.as_deref().unwrap_or("").contains("404"));
        mock.assert_async().await;
    }

    // End to end: HEAD answered with 405 must trigger a GET retry; both mocks
    // are asserted to have been hit and the GET's 200 makes the result Ok.
    #[tokio::test]
    async fn verify_urls_head_405_falls_back_to_get() {
        let mut server = mockito::Server::new_async().await;
        let head_mock = server
            .mock("HEAD", "/no-head")
            .with_status(405)
            .create_async()
            .await;
        let get_mock = server
            .mock("GET", "/no-head")
            .with_status(200)
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/no-head", server.url()),
            is_thumbnail: false,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        head_mock.assert_async().await;
        get_mock.assert_async().await;
    }

    // End to end: a thumbnail whose HEAD response declares `image/jpeg`
    // passes the content-type check.
    #[tokio::test]
    async fn verify_urls_thumbnail_content_type_check() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/img.jpg")
            .with_status(200)
            .with_header("content-type", "image/jpeg")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/img.jpg", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Ok);
        mock.assert_async().await;
    }

    // End to end: a thumbnail whose HEAD response declares `text/html` is an
    // Error mentioning the expected `image/*` type.
    #[tokio::test]
    async fn verify_urls_thumbnail_wrong_content_type() {
        let mut server = mockito::Server::new_async().await;
        let mock = server
            .mock("HEAD", "/not-image")
            .with_status(200)
            .with_header("content-type", "text/html")
            .create_async()
            .await;

        let urls = vec![UrlEntry {
            url: format!("{}/not-image", server.url()),
            is_thumbnail: true,
        }];

        let results = verify_urls(urls, 4, 5).await;
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].status, CheckStatus::Error);
        assert!(
            results[0]
                .detail
                .as_deref()
                .unwrap_or("")
                .contains("image/*")
        );
        mock.assert_async().await;
    }
}
523}