Skip to main content

cortex_runtime/acquisition/
head_scanner.rs

//! Parallel HEAD request scanner for URL metadata.
//!
//! Quickly determines status, content type, language, and freshness
//! for discovered URLs without downloading bodies.
5
6use super::http_client::{HeadResponse, HttpClient};
7
/// Metadata gathered for a single URL from a HEAD request.
///
/// A failed request is represented by `status == 0` with every optional
/// field set to `None` and both flags set to `false`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HeadResult {
    /// The scanned URL.
    pub url: String,
    /// HTTP status code (0 if the request failed).
    pub status: u16,
    /// Value of the `Content-Type` header (e.g., "text/html"), if any.
    pub content_type: Option<String>,
    /// Value of the `Content-Language` header, if any.
    pub content_language: Option<String>,
    /// Whether the page is considered fresh: `Cache-Control` mentions
    /// `no-cache` or `must-revalidate`, or a `Last-Modified` header is
    /// present. The `Last-Modified` date itself is not inspected.
    pub is_fresh: bool,
    /// Whether this looks like an HTML page: the content type mentions
    /// `text/html` or `application/xhtml`, or no content type was reported.
    pub is_html: bool,
}
24
25/// Scan URLs with parallel HEAD requests.
26///
27/// Returns metadata for each URL without downloading page bodies.
28/// Non-HTML URLs and error responses are marked accordingly.
29pub async fn scan_heads(urls: &[String], client: &HttpClient) -> Vec<HeadResult> {
30    let responses = client.head_many(urls, 20).await;
31
32    responses
33        .into_iter()
34        .zip(urls.iter())
35        .map(|(result, url)| match result {
36            Ok(resp) => head_response_to_result(resp),
37            Err(_) => HeadResult {
38                url: url.clone(),
39                status: 0,
40                content_type: None,
41                content_language: None,
42                is_fresh: false,
43                is_html: false,
44            },
45        })
46        .collect()
47}
48
49/// Filter URLs to only those that are HTML pages (status 200 + text/html).
50pub fn filter_html_urls(results: &[HeadResult]) -> Vec<String> {
51    results
52        .iter()
53        .filter(|r| r.status == 200 && r.is_html)
54        .map(|r| r.url.clone())
55        .collect()
56}
57
58fn head_response_to_result(resp: HeadResponse) -> HeadResult {
59    let is_html = resp
60        .content_type
61        .as_deref()
62        .map(|ct| ct.contains("text/html") || ct.contains("application/xhtml"))
63        .unwrap_or(true); // assume HTML if no content-type
64
65    let is_fresh = resp
66        .cache_control
67        .as_deref()
68        .map(|cc| cc.contains("no-cache") || cc.contains("must-revalidate"))
69        .unwrap_or(false)
70        || resp.last_modified.is_some();
71
72    HeadResult {
73        url: resp.url,
74        status: resp.status,
75        content_type: resp.content_type,
76        content_language: resp.content_language,
77        is_fresh,
78        is_html,
79    }
80}
81
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a status-200 response for `url` with the given content type
    /// and optional `Last-Modified` value; all other headers are absent.
    fn ok_response(url: &str, content_type: &str, last_modified: Option<&str>) -> HeadResponse {
        HeadResponse {
            url: url.to_string(),
            status: 200,
            content_type: Some(content_type.to_string()),
            content_language: None,
            last_modified: last_modified.map(str::to_string),
            cache_control: None,
        }
    }

    /// Builds a non-fresh scan result with the given status, content type,
    /// and HTML flag.
    fn scanned(url: &str, status: u16, content_type: &str, is_html: bool) -> HeadResult {
        HeadResult {
            url: url.to_string(),
            status,
            content_type: Some(content_type.to_string()),
            content_language: None,
            is_fresh: false,
            is_html,
        }
    }

    #[test]
    fn test_head_response_to_result_html() {
        let resp = HeadResponse {
            content_language: Some("en".to_string()),
            ..ok_response(
                "https://example.com/",
                "text/html; charset=utf-8",
                Some("Tue, 15 Jan 2026 12:00:00 GMT"),
            )
        };

        let HeadResult {
            is_html,
            is_fresh,
            status,
            ..
        } = head_response_to_result(resp);

        assert!(is_html);
        assert!(is_fresh);
        assert_eq!(status, 200);
    }

    #[test]
    fn test_head_response_to_result_non_html() {
        let resp = ok_response("https://example.com/image.png", "image/png", None);

        let result = head_response_to_result(resp);

        assert!(!result.is_html);
        assert!(!result.is_fresh);
    }

    #[test]
    fn test_filter_html_urls() {
        let results = [
            scanned("https://example.com/page", 200, "text/html", true),
            scanned("https://example.com/image.png", 200, "image/png", false),
            // HTML content type, but the 404 status must exclude it.
            scanned("https://example.com/missing", 404, "text/html", true),
        ];

        let html_urls = filter_html_urls(&results);

        assert_eq!(html_urls, ["https://example.com/page"]);
    }
}