cortex_runtime/acquisition/
head_scanner.rs1use super::http_client::{HeadResponse, HttpClient};
7
/// Summary of one HEAD probe.
///
/// A value is either derived from a received response or, when the request
/// itself failed, filled with `status == 0`, no header values, and both flags
/// set to `false`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HeadResult {
    /// URL this result describes.
    pub url: String,
    /// HTTP status code of the response; `0` when no response was obtained.
    pub status: u16,
    /// `Content-Type` value reported by the server, if any.
    pub content_type: Option<String>,
    /// `Content-Language` value reported by the server, if any.
    pub content_language: Option<String>,
    /// Freshness flag derived from the response's cache-control and
    /// last-modified information.
    pub is_fresh: bool,
    /// Whether the response looks like an HTML (or XHTML) document.
    pub is_html: bool,
}
24
25pub async fn scan_heads(urls: &[String], client: &HttpClient) -> Vec<HeadResult> {
30 let responses = client.head_many(urls, 20).await;
31
32 responses
33 .into_iter()
34 .zip(urls.iter())
35 .map(|(result, url)| match result {
36 Ok(resp) => head_response_to_result(resp),
37 Err(_) => HeadResult {
38 url: url.clone(),
39 status: 0,
40 content_type: None,
41 content_language: None,
42 is_fresh: false,
43 is_html: false,
44 },
45 })
46 .collect()
47}
48
49pub fn filter_html_urls(results: &[HeadResult]) -> Vec<String> {
51 results
52 .iter()
53 .filter(|r| r.status == 200 && r.is_html)
54 .map(|r| r.url.clone())
55 .collect()
56}
57
58fn head_response_to_result(resp: HeadResponse) -> HeadResult {
59 let is_html = resp
60 .content_type
61 .as_deref()
62 .map(|ct| ct.contains("text/html") || ct.contains("application/xhtml"))
63 .unwrap_or(true); let is_fresh = resp
66 .cache_control
67 .as_deref()
68 .map(|cc| cc.contains("no-cache") || cc.contains("must-revalidate"))
69 .unwrap_or(false)
70 || resp.last_modified.is_some();
71
72 HeadResult {
73 url: resp.url,
74 status: resp.status,
75 content_type: resp.content_type,
76 content_language: resp.content_language,
77 is_fresh,
78 is_html,
79 }
80}
81
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a status-200 response without a cache-control value.
    fn ok_response(
        url: &str,
        content_type: &str,
        content_language: Option<&str>,
        last_modified: Option<&str>,
    ) -> HeadResponse {
        HeadResponse {
            url: url.to_string(),
            status: 200,
            content_type: Some(content_type.to_string()),
            content_language: content_language.map(String::from),
            last_modified: last_modified.map(String::from),
            cache_control: None,
        }
    }

    /// Builds a non-fresh result with no content language.
    fn scanned(url: &str, status: u16, content_type: &str, is_html: bool) -> HeadResult {
        HeadResult {
            url: url.to_string(),
            status,
            content_type: Some(content_type.to_string()),
            content_language: None,
            is_fresh: false,
            is_html,
        }
    }

    // An HTML content type plus a last-modified value yields an HTML, fresh result.
    #[test]
    fn test_head_response_to_result_html() {
        let converted = head_response_to_result(ok_response(
            "https://example.com/",
            "text/html; charset=utf-8",
            Some("en"),
            Some("Tue, 15 Jan 2026 12:00:00 GMT"),
        ));

        assert!(converted.is_html);
        assert!(converted.is_fresh);
        assert_eq!(converted.status, 200);
    }

    // An image content type with no caching information is neither HTML nor fresh.
    #[test]
    fn test_head_response_to_result_non_html() {
        let converted = head_response_to_result(ok_response(
            "https://example.com/image.png",
            "image/png",
            None,
            None,
        ));

        assert!(!converted.is_html);
        assert!(!converted.is_fresh);
    }

    // Only the entry that is both HTML and status 200 survives the filter.
    #[test]
    fn test_filter_html_urls() {
        let results = vec![
            scanned("https://example.com/page", 200, "text/html", true),
            scanned("https://example.com/image.png", 200, "image/png", false),
            scanned("https://example.com/missing", 404, "text/html", true),
        ];

        let html_urls = filter_html_urls(&results);
        assert_eq!(html_urls, ["https://example.com/page"]);
    }
}