lean_ctx/core/web/
fetch.rs

1//! Bounded, SSRF-aware HTTP fetch built on `ureq`.
2//!
3//! Redirects are followed manually so every hop passes back through
4//! [`url_guard`], closing the redirect-to-internal SSRF hole that automatic
5//! redirect following would open. Response bodies are capped to a byte budget so
6//! a hostile server cannot exhaust memory.
7
8use std::io::Read;
9use std::time::Duration;
10
11use super::url_guard::{self, SafeUrl};
12
/// Default cap on the response body (4 MiB): roomy for articles, bounded for memory.
pub const DEFAULT_MAX_BYTES: usize = 4 << 20;
/// Default request timeout, in seconds.
pub const DEFAULT_TIMEOUT_SECS: u64 = 20;

/// Maximum number of redirect hops `fetch` follows.
const MAX_REDIRECTS: u32 = 5;
/// Value of the `user-agent` header sent by `fetch`.
const USER_AGENT: &str = "lean-ctx/3.7 (+https://leanctx.com; ctx_url_read)";
/// Value of the `accept` header sent by `fetch`.
const ACCEPT: &str = "text/html,application/xhtml+xml,text/plain;q=0.9,*/*;q=0.5";
21
/// A fetched document with its raw body bytes and resolved metadata.
///
/// The body is kept as bytes so binary payloads (e.g. PDF) survive intact;
/// textual callers use [`FetchedDoc::body_text`] for a lossy UTF-8 view.
///
/// `Debug` is derived so that `Result<FetchedDoc, _>` supports `unwrap_err` /
/// `expect_err` and the value can be logged; `Clone` allows callers to keep a
/// copy without re-fetching.
#[derive(Debug, Clone)]
pub struct FetchedDoc {
    /// URL the response was read from, after any followed redirects.
    pub final_url: String,
    /// Lower-cased MIME type without parameters (e.g. `text/html`).
    pub content_type: String,
    /// Raw response body, limited to the caller's byte budget.
    pub bytes: Vec<u8>,
    /// HTTP status code of the response the body was read from.
    pub status: u16,
    /// `true` when the body exceeded the byte budget and was cut short.
    pub truncated: bool,
}

impl FetchedDoc {
    /// Lossy UTF-8 view of the body, for textual content (HTML, JSON, …).
    ///
    /// Invalid UTF-8 sequences are replaced with U+FFFD instead of failing.
    pub fn body_text(&self) -> String {
        String::from_utf8_lossy(&self.bytes).into_owned()
    }
}
41
42/// Fetch `url`, following up to `MAX_REDIRECTS` re-validated redirects.
43pub fn fetch(url: &str, max_bytes: usize, timeout_secs: u64) -> Result<FetchedDoc, String> {
44    let mut current = url_guard::validate(url).map_err(|e| e.to_string())?;
45    current
46        .ensure_resolves_safely()
47        .map_err(|e| e.to_string())?;
48
49    let agent = build_agent(timeout_secs);
50    let mut hops = 0u32;
51
52    loop {
53        let resp = agent
54            .get(&current.normalized)
55            .header("user-agent", USER_AGENT)
56            .header("accept", ACCEPT)
57            .header("accept-language", "en,*;q=0.5")
58            .call()
59            .map_err(|e| format!("request failed: {e}"))?;
60
61        let status = resp.status().as_u16();
62
63        if (300..400).contains(&status) && hops < MAX_REDIRECTS {
64            if let Some(location) = header_value(&resp, "location") {
65                let next = resolve_redirect(&current, &location);
66                let next_url = url_guard::validate(&next).map_err(|e| e.to_string())?;
67                next_url
68                    .ensure_resolves_safely()
69                    .map_err(|e| e.to_string())?;
70                current = next_url;
71                hops += 1;
72                continue;
73            }
74        }
75
76        let content_type = header_value(&resp, "content-type")
77            .and_then(|v| v.split(';').next().map(|m| m.trim().to_ascii_lowercase()))
78            .unwrap_or_default();
79        let (bytes, truncated) = read_bounded(resp, max_bytes)?;
80
81        return Ok(FetchedDoc {
82            final_url: current.normalized.clone(),
83            content_type,
84            bytes,
85            status,
86            truncated,
87        });
88    }
89}
90
91/// POST `body` to `url` (SSRF-guarded, bounded, redirects not followed).
92///
93/// Needed for JSON-RPC style endpoints — e.g. YouTube's InnerTube `player`
94/// API, whose caption URLs are server-fetchable (unlike the watch-page ones).
95/// `user_agent` is explicit because some APIs validate it against the declared
96/// client.
97pub fn post(
98    url: &str,
99    content_type: &str,
100    user_agent: &str,
101    body: &str,
102    max_bytes: usize,
103    timeout_secs: u64,
104) -> Result<FetchedDoc, String> {
105    let target = url_guard::validate(url).map_err(|e| e.to_string())?;
106    target.ensure_resolves_safely().map_err(|e| e.to_string())?;
107
108    let agent = build_agent(timeout_secs);
109    let resp = agent
110        .post(&target.normalized)
111        .header("user-agent", user_agent)
112        .header("content-type", content_type)
113        .header("accept", "application/json, text/xml;q=0.9, */*;q=0.5")
114        .send(body.as_bytes())
115        .map_err(|e| format!("request failed: {e}"))?;
116
117    let status = resp.status().as_u16();
118    let content_type = header_value(&resp, "content-type")
119        .and_then(|v| v.split(';').next().map(|m| m.trim().to_ascii_lowercase()))
120        .unwrap_or_default();
121    let (bytes, truncated) = read_bounded(resp, max_bytes)?;
122
123    Ok(FetchedDoc {
124        final_url: target.normalized,
125        content_type,
126        bytes,
127        status,
128        truncated,
129    })
130}
131
132fn build_agent(timeout_secs: u64) -> ureq::Agent {
133    ureq::Agent::new_with_config(
134        ureq::config::Config::builder()
135            .timeout_global(Some(Duration::from_secs(timeout_secs)))
136            .max_redirects(0)
137            .http_status_as_error(false)
138            .build(),
139    )
140}
141
142fn header_value<B>(resp: &ureq::http::Response<B>, name: &str) -> Option<String> {
143    resp.headers()
144        .get(name)
145        .and_then(|v| v.to_str().ok())
146        .map(str::to_string)
147}
148
149fn read_bounded(
150    resp: ureq::http::Response<ureq::Body>,
151    max_bytes: usize,
152) -> Result<(Vec<u8>, bool), String> {
153    let mut reader = resp.into_body().into_reader();
154    let mut buf: Vec<u8> = Vec::with_capacity(8192.min(max_bytes.max(1)));
155    let mut chunk = [0u8; 8192];
156    let mut truncated = false;
157
158    loop {
159        let n = reader
160            .read(&mut chunk)
161            .map_err(|e| format!("failed to read body: {e}"))?;
162        if n == 0 {
163            break;
164        }
165        let remaining = max_bytes.saturating_sub(buf.len());
166        if remaining == 0 {
167            truncated = true;
168            break;
169        }
170        let take = n.min(remaining);
171        buf.extend_from_slice(&chunk[..take]);
172        if take < n {
173            truncated = true;
174            break;
175        }
176    }
177
178    Ok((buf, truncated))
179}
180
181/// Resolve a (possibly relative) `location` (redirect target or link href)
182/// against a base URL.
183pub(crate) fn resolve_redirect(base: &SafeUrl, location: &str) -> String {
184    let loc = location.trim();
185
186    if loc.starts_with("http://") || loc.starts_with("https://") {
187        return loc.to_string();
188    }
189    if let Some(rest) = loc.strip_prefix("//") {
190        return format!("{}://{rest}", base.scheme);
191    }
192    if loc.starts_with('/') {
193        return format!("{}://{}{loc}", base.scheme, base.authority);
194    }
195
196    // Path-relative: join against the directory of the current path.
197    let base_path = base_path(base);
198    let dir = match base_path.rfind('/') {
199        Some(i) => &base_path[..=i],
200        None => "/",
201    };
202    format!("{}://{}{dir}{loc}", base.scheme, base.authority)
203}
204
205fn base_path(base: &SafeUrl) -> &str {
206    let prefix_len = base.scheme.len() + 3 + base.authority.len();
207    let path = base.normalized.get(prefix_len..).unwrap_or("");
208    let path = path.split(['?', '#']).next().unwrap_or("");
209    if path.is_empty() {
210        "/"
211    } else {
212        path
213    }
214}
215
#[cfg(test)]
mod tests {
    use super::*;

    /// Run `base` through the URL guard, then resolve `location` against it.
    fn resolved(base: &str, location: &str) -> String {
        let base = url_guard::validate(base).unwrap();
        resolve_redirect(&base, location)
    }

    #[test]
    fn redirect_absolute_is_passthrough() {
        let got = resolved("https://a.com/x", "https://b.com/y");
        assert_eq!(got, "https://b.com/y");
    }

    #[test]
    fn redirect_scheme_relative() {
        let got = resolved("https://a.com/x", "//c.com/z");
        assert_eq!(got, "https://c.com/z");
    }

    #[test]
    fn redirect_root_relative() {
        let got = resolved("https://a.com/deep/path?q=1", "/new");
        assert_eq!(got, "https://a.com/new");
    }

    #[test]
    fn redirect_path_relative_joins_dir() {
        let got = resolved("https://a.com/dir/page.html", "other.html");
        assert_eq!(got, "https://a.com/dir/other.html");
    }

    #[test]
    fn redirect_path_relative_from_root() {
        let got = resolved("https://a.com", "page");
        assert_eq!(got, "https://a.com/page");
    }
}
lean_ctx/core/web/fetch.rs

lean_ctx/core/web/
fetch.rs