lean_ctx/core/web/
fetch.rs1use std::io::Read;
9use std::time::Duration;
10
11use super::url_guard::{self, SafeUrl};
12
/// Default cap on a response body, in bytes (4 MiB).
// NOTE(review): presumably the value callers pass as `max_bytes` to `fetch`/`post`;
// it is not referenced inside this file — confirm against callers.
pub const DEFAULT_MAX_BYTES: usize = 4 * 1024 * 1024;
/// Default request timeout, in seconds.
// NOTE(review): presumably the value callers pass as `timeout_secs` to `fetch`/`post`;
// it is not referenced inside this file — confirm against callers.
pub const DEFAULT_TIMEOUT_SECS: u64 = 20;
17
/// Maximum number of redirect hops `fetch` will follow before returning the
/// redirect response itself.
const MAX_REDIRECTS: u32 = 5;
/// Value of the `user-agent` header sent by `fetch`.
const USER_AGENT: &str = "lean-ctx/3.7 (+https://leanctx.com; ctx_url_read)";
/// Value of the `accept` header sent by `fetch`.
const ACCEPT: &str = "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.5";
21
/// Result of a single HTTP exchange performed by `fetch` or `post`.
///
/// `Debug` and `Clone` are derived so the type can be logged, copied, and used with
/// `Result::unwrap_err` / `assert_eq!`-style helpers that require `Debug` on the `Ok` type.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FetchedDoc {
    /// URL of the request that produced this response (after any followed redirects).
    pub final_url: String,
    /// Media type from the `content-type` header: parameters stripped, trimmed and
    /// lowercased. Empty when the header is absent or not valid text.
    pub content_type: String,
    /// Response body, limited to the caller-supplied byte cap.
    pub bytes: Vec<u8>,
    /// HTTP status code of the response.
    pub status: u16,
    /// `true` when the body was longer than the byte cap and `bytes` holds only a prefix.
    pub truncated: bool,
}
34
35impl FetchedDoc {
36 pub fn body_text(&self) -> String {
38 String::from_utf8_lossy(&self.bytes).into_owned()
39 }
40}
41
42pub fn fetch(url: &str, max_bytes: usize, timeout_secs: u64) -> Result<FetchedDoc, String> {
44 let mut current = url_guard::validate(url).map_err(|e| e.to_string())?;
45 current
46 .ensure_resolves_safely()
47 .map_err(|e| e.to_string())?;
48
49 let agent = build_agent(timeout_secs);
50 let mut hops = 0u32;
51
52 loop {
53 let resp = agent
54 .get(¤t.normalized)
55 .header("user-agent", USER_AGENT)
56 .header("accept", ACCEPT)
57 .header("accept-language", "en,*;q=0.5")
58 .call()
59 .map_err(|e| format!("request failed: {e}"))?;
60
61 let status = resp.status().as_u16();
62
63 if (300..400).contains(&status) && hops < MAX_REDIRECTS {
64 if let Some(location) = header_value(&resp, "location") {
65 let next = resolve_redirect(¤t, &location);
66 let next_url = url_guard::validate(&next).map_err(|e| e.to_string())?;
67 next_url
68 .ensure_resolves_safely()
69 .map_err(|e| e.to_string())?;
70 current = next_url;
71 hops += 1;
72 continue;
73 }
74 }
75
76 let content_type = header_value(&resp, "content-type")
77 .and_then(|v| v.split(';').next().map(|m| m.trim().to_ascii_lowercase()))
78 .unwrap_or_default();
79 let (bytes, truncated) = read_bounded(resp, max_bytes)?;
80
81 return Ok(FetchedDoc {
82 final_url: current.normalized.clone(),
83 content_type,
84 bytes,
85 status,
86 truncated,
87 });
88 }
89}
90
91pub fn post(
98 url: &str,
99 content_type: &str,
100 user_agent: &str,
101 body: &str,
102 max_bytes: usize,
103 timeout_secs: u64,
104) -> Result<FetchedDoc, String> {
105 let target = url_guard::validate(url).map_err(|e| e.to_string())?;
106 target.ensure_resolves_safely().map_err(|e| e.to_string())?;
107
108 let agent = build_agent(timeout_secs);
109 let resp = agent
110 .post(&target.normalized)
111 .header("user-agent", user_agent)
112 .header("content-type", content_type)
113 .header("accept", "application/json, text/xml;q=0.9, */*;q=0.5")
114 .send(body.as_bytes())
115 .map_err(|e| format!("request failed: {e}"))?;
116
117 let status = resp.status().as_u16();
118 let content_type = header_value(&resp, "content-type")
119 .and_then(|v| v.split(';').next().map(|m| m.trim().to_ascii_lowercase()))
120 .unwrap_or_default();
121 let (bytes, truncated) = read_bounded(resp, max_bytes)?;
122
123 Ok(FetchedDoc {
124 final_url: target.normalized,
125 content_type,
126 bytes,
127 status,
128 truncated,
129 })
130}
131
132fn build_agent(timeout_secs: u64) -> ureq::Agent {
133 ureq::Agent::new_with_config(
134 ureq::config::Config::builder()
135 .timeout_global(Some(Duration::from_secs(timeout_secs)))
136 .max_redirects(0)
137 .http_status_as_error(false)
138 .build(),
139 )
140}
141
142fn header_value<B>(resp: &ureq::http::Response<B>, name: &str) -> Option<String> {
143 resp.headers()
144 .get(name)
145 .and_then(|v| v.to_str().ok())
146 .map(str::to_string)
147}
148
149fn read_bounded(
150 resp: ureq::http::Response<ureq::Body>,
151 max_bytes: usize,
152) -> Result<(Vec<u8>, bool), String> {
153 let mut reader = resp.into_body().into_reader();
154 let mut buf: Vec<u8> = Vec::with_capacity(8192.min(max_bytes.max(1)));
155 let mut chunk = [0u8; 8192];
156 let mut truncated = false;
157
158 loop {
159 let n = reader
160 .read(&mut chunk)
161 .map_err(|e| format!("failed to read body: {e}"))?;
162 if n == 0 {
163 break;
164 }
165 let remaining = max_bytes.saturating_sub(buf.len());
166 if remaining == 0 {
167 truncated = true;
168 break;
169 }
170 let take = n.min(remaining);
171 buf.extend_from_slice(&chunk[..take]);
172 if take < n {
173 truncated = true;
174 break;
175 }
176 }
177
178 Ok((buf, truncated))
179}
180
181pub(crate) fn resolve_redirect(base: &SafeUrl, location: &str) -> String {
184 let loc = location.trim();
185
186 if loc.starts_with("http://") || loc.starts_with("https://") {
187 return loc.to_string();
188 }
189 if let Some(rest) = loc.strip_prefix("//") {
190 return format!("{}://{rest}", base.scheme);
191 }
192 if loc.starts_with('/') {
193 return format!("{}://{}{loc}", base.scheme, base.authority);
194 }
195
196 let base_path = base_path(base);
198 let dir = match base_path.rfind('/') {
199 Some(i) => &base_path[..=i],
200 None => "/",
201 };
202 format!("{}://{}{dir}{loc}", base.scheme, base.authority)
203}
204
205fn base_path(base: &SafeUrl) -> &str {
206 let prefix_len = base.scheme.len() + 3 + base.authority.len();
207 let path = base.normalized.get(prefix_len..).unwrap_or("");
208 let path = path.split(['?', '#']).next().unwrap_or("");
209 if path.is_empty() {
210 "/"
211 } else {
212 path
213 }
214}
215
#[cfg(test)]
mod tests {
    use super::*;

    /// Validates `url` into a `SafeUrl`, panicking when the guard rejects it.
    fn safe(url: &str) -> SafeUrl {
        url_guard::validate(url).unwrap()
    }

    /// Resolves `location` against `base_url` the way a redirect hop would.
    fn resolved(base_url: &str, location: &str) -> String {
        let base = safe(base_url);
        resolve_redirect(&base, location)
    }

    // An absolute target replaces the base entirely.
    #[test]
    fn redirect_absolute_is_passthrough() {
        let got = resolved("https://a.com/x", "https://b.com/y");
        assert_eq!(got, "https://b.com/y");
    }

    // `//host/path` inherits only the scheme of the base.
    #[test]
    fn redirect_scheme_relative() {
        let got = resolved("https://a.com/x", "//c.com/z");
        assert_eq!(got, "https://c.com/z");
    }

    // `/path` keeps scheme and authority, dropping the base path and query.
    #[test]
    fn redirect_root_relative() {
        let got = resolved("https://a.com/deep/path?q=1", "/new");
        assert_eq!(got, "https://a.com/new");
    }

    // A bare relative path is joined to the directory of the base path.
    #[test]
    fn redirect_path_relative_joins_dir() {
        let got = resolved("https://a.com/dir/page.html", "other.html");
        assert_eq!(got, "https://a.com/dir/other.html");
    }

    // With no base path at all, the relative path lands under the root.
    #[test]
    fn redirect_path_relative_from_root() {
        let got = resolved("https://a.com", "page");
        assert_eq!(got, "https://a.com/page");
    }
}