Skip to main content

chaser_util/
proxy.rs

1//! Shared proxy / HTTP utilities.
2//!
3//! `ProxyMode`, `ProxyConnector`, and the low-level `get` / `post` helpers
4//! are defined here once and re-exported to every scraper module.
5//! This eliminates the four near-identical copies that existed before.
6
7use bytes::Bytes;
8use http_body_util::{BodyExt, Empty};
9use hyper::Request;
10use hyper_util::client::legacy::connect::HttpConnector;
11use hyper_util::client::legacy::Client;
12use hyper_util::client::proxy::matcher::Matcher;
13use hyper_util::rt::TokioExecutor;
14
15// ----------------------------------------------------------------
16// ProxyMode
17// ----------------------------------------------------------------
18
/// How to route outbound HTTP requests.
///
/// `PartialEq` / `Eq` are derived so that two modes can be compared
/// directly with `==` (for example in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ProxyMode {
    /// Detect proxy automatically (env vars → OS settings → direct).
    Auto,
    /// Always connect directly; ignore any system proxy.
    Direct,
    /// Use the supplied proxy URI (e.g. `"http://192.168.1.1:8080"`).
    Manual(String),
}

impl ProxyMode {
    /// Build a `ProxyMode` from an `Option<&str>`:
    /// - `None`       → `Auto`
    /// - `Some("")`   → `Direct`
    /// - `Some(uri)`  → `Manual(uri)`
    ///
    /// The string is stored verbatim: it is neither trimmed nor validated
    /// here, so only the exactly-empty string selects `Direct`.
    pub fn from_option(opt: Option<&str>) -> Self {
        match opt {
            None => Self::Auto,
            Some("") => Self::Direct,
            Some(uri) => Self::Manual(uri.to_owned()),
        }
    }
}
43
44// ----------------------------------------------------------------
45// ProxyConnector
46// ----------------------------------------------------------------
47
48/// A hyper `Connector` that always dials a fixed upstream proxy host:port,
49/// regardless of the target URI.  The real target URI is carried in the
50/// `Host` header (HTTP/1.1 CONNECT-less plain-HTTP proxy protocol).
51#[derive(Clone)]
52pub struct ProxyConnector {
53    inner:      HttpConnector,
54    proxy_host: String,
55    proxy_port: u16,
56}
57
58impl ProxyConnector {
59    pub fn new(proxy_host: impl Into<String>, proxy_port: u16) -> Self {
60        let mut inner = HttpConnector::new();
61        inner.enforce_http(false);
62        Self { inner, proxy_host: proxy_host.into(), proxy_port }
63    }
64}
65
66impl tower_service::Service<http::Uri> for ProxyConnector {
67    type Response = <HttpConnector as tower_service::Service<http::Uri>>::Response;
68    type Error    = <HttpConnector as tower_service::Service<http::Uri>>::Error;
69    type Future   = <HttpConnector as tower_service::Service<http::Uri>>::Future;
70
71    fn poll_ready(
72        &mut self,
73        cx: &mut std::task::Context<'_>,
74    ) -> std::task::Poll<Result<(), Self::Error>> {
75        self.inner.poll_ready(cx)
76    }
77
78    fn call(&mut self, _uri: http::Uri) -> Self::Future {
79        let proxy_uri: http::Uri =
80            format!("http://{}:{}", self.proxy_host, self.proxy_port)
81                .parse()
82                .unwrap_or_else(|_| http::Uri::from_static("http://127.0.0.1:8080"));
83        self.inner.call(proxy_uri)
84    }
85}
86
87// ----------------------------------------------------------------
88// Standard User-Agent string
89// ----------------------------------------------------------------
90
/// Browser-like `User-Agent` value sent with every request, so that the
/// target server does not reject requests and all callers present the same
/// identity.  (Previously `room_list.rs` used a different string, causing
/// inconsistent behaviour.)
pub const USER_AGENT: &str = concat!(
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
    "AppleWebKit/537.36 (KHTML, like Gecko) ",
    "Chrome/124.0.0.0 Safari/537.36",
);
98
99// ----------------------------------------------------------------
100// Shared error type alias
101// ----------------------------------------------------------------
102
/// Boxed, thread-safe error used as the error type of every fallible
/// helper in this module.
pub type BoxError = Box<dyn std::error::Error + Send + Sync + 'static>;
104
105// ----------------------------------------------------------------
106// send_once  (single request, no redirect following)
107// ----------------------------------------------------------------
108
109/// Send a single GET request and return `(status, headers, body)`.
110///
111/// `extra` is a slice of additional `(header-name, value)` pairs appended
112/// to every request.
113pub async fn send_once(
114    url:        &str,
115    extra:      &[(&'static str, String)],
116    proxy_mode: &ProxyMode,
117) -> Result<(u16, hyper::HeaderMap, Bytes), BoxError> {
118    let target_uri: http::Uri = url.parse()?;
119
120    // Macro avoids repeating the header-appending loop for every proxy branch.
121    macro_rules! build_req {
122        ($builder:expr) => {{
123            let mut b = $builder;
124            for (k, v) in extra {
125                b = b.header(*k, v.as_str());
126            }
127            b.body(Empty::<Bytes>::new())?
128        }};
129    }
130
131    macro_rules! base_builder {
132        ($url:expr) => {
133            Request::builder()
134                .method("GET")
135                .uri($url)
136                .header("User-Agent", USER_AGENT)
137        };
138    }
139
140    let resp = match proxy_mode {
141        ProxyMode::Auto => {
142            let matcher = Matcher::from_system();
143            if let Some(intercept) = matcher.intercept(&target_uri) {
144                let ph = intercept.uri().host().unwrap_or("127.0.0.1").to_string();
145                let pp = intercept.uri().port_u16().unwrap_or(8080);
146                let client = Client::builder(TokioExecutor::new())
147                    .build::<_, Empty<Bytes>>(ProxyConnector::new(ph, pp));
148                let mut b = base_builder!(url);
149                if let Some(auth) = intercept.basic_auth() {
150                    b = b.header("Proxy-Authorization", auth);
151                }
152                client.request(build_req!(b)).await?
153            } else {
154                let mut conn = HttpConnector::new();
155                conn.enforce_http(false);
156                let client = Client::builder(TokioExecutor::new())
157                    .build::<_, Empty<Bytes>>(conn);
158                client.request(build_req!(base_builder!(url))).await?
159            }
160        }
161
162        ProxyMode::Direct => {
163            let mut conn = HttpConnector::new();
164            conn.enforce_http(false);
165            let client = Client::builder(TokioExecutor::new())
166                .build::<_, Empty<Bytes>>(conn);
167            client.request(build_req!(base_builder!(url))).await?
168        }
169
170        ProxyMode::Manual(proxy_uri_str) => {
171            let proxy_uri: http::Uri = proxy_uri_str.parse()?;
172            let ph = proxy_uri.host().unwrap_or("127.0.0.1").to_string();
173            let pp = proxy_uri.port_u16().unwrap_or(8080);
174            let client = Client::builder(TokioExecutor::new())
175                .build::<_, Empty<Bytes>>(ProxyConnector::new(ph, pp));
176            client.request(build_req!(base_builder!(url))).await?
177        }
178    };
179
180    let status  = resp.status().as_u16();
181    let headers = resp.headers().clone();
182    let body    = resp.into_body().collect().await?.to_bytes();
183    Ok((status, headers, body))
184}
185
186// ----------------------------------------------------------------
187// send_follow_redirects
188// ----------------------------------------------------------------
189
190/// Follow up to `MAX_REDIRECTS` HTTP 3xx responses automatically.
191///
192/// Returns `(final_body, Option<jsessionid>)`.
193/// The JSESSIONID is collected from `Set-Cookie` headers on any hop.
194///
195/// Relative `Location` values are resolved against the current URL so that
196/// servers returning `/path?foo=bar` instead of an absolute URI are handled
197/// correctly.
198pub async fn send_follow_redirects(
199    start_url:  &str,
200    extra:      &[(&'static str, String)],
201    proxy_mode: &ProxyMode,
202) -> Result<(Bytes, Option<String>), BoxError> {
203    const MAX_REDIRECTS: usize = 10;
204
205    let mut url      = start_url.to_string();
206    let mut jsession = None::<String>;
207
208    for _ in 0..MAX_REDIRECTS {
209        let (status, headers, body) = send_once(&url, extra, proxy_mode).await?;
210
211        // Harvest JSESSIONID from every hop
212        for val in headers.get_all("set-cookie").iter() {
213            for part in val.to_str().unwrap_or("").split(';') {
214                if let Some(id) = part.trim().strip_prefix("JSESSIONID=") {
215                    jsession = Some(id.to_string());
216                }
217            }
218        }
219
220        if (300..400).contains(&status) {
221            if let Some(loc) = headers.get("location") {
222                let loc_str = loc.to_str()?;
223                // Resolve relative Location against current URL
224                url = resolve_url(&url, loc_str)?;
225                continue;
226            }
227        }
228
229        return Ok((body, jsession));
230    }
231
232    Err("too many redirects".into())
233}
234
235// ----------------------------------------------------------------
236// URL helpers
237// ----------------------------------------------------------------
238
239/// Resolve `location` (possibly relative) against `base`.
240///
241/// Examples:
242///   resolve("http://host/a/b", "/c/d")   → "http://host/c/d"
243///   resolve("http://host/a/b", "c/d")    → "http://host/a/c/d"
244///   resolve("http://host/a/b", "http://other/x") → "http://other/x"
245pub fn resolve_url(base: &str, location: &str) -> Result<String, BoxError> {
246    // Already absolute
247    if location.starts_with("http://") || location.starts_with("https://") {
248        return Ok(location.to_string());
249    }
250
251    // Parse base to extract scheme + authority
252    let base_uri: http::Uri = base.parse()?;
253    let scheme    = base_uri.scheme_str().unwrap_or("http");
254    let authority = base_uri.authority().map(|a| a.as_str()).unwrap_or("");
255
256    if location.starts_with('/') {
257        // Absolute path
258        Ok(format!("{}://{}{}", scheme, authority, location))
259    } else {
260        // Relative path: resolve against base path's directory
261        let base_path = base_uri.path();
262        let dir = match base_path.rfind('/') {
263            Some(i) => &base_path[..=i],
264            None    => "/",
265        };
266        Ok(format!("{}://{}{}{}", scheme, authority, dir, location))
267    }
268}
269
270// ----------------------------------------------------------------
271// URL encoding helper
272// ----------------------------------------------------------------
273
274/// Percent-encode a string for use as a query parameter value.
275/// Uses `percent-encoding` crate with `NON_ALPHANUMERIC` set (safe for all values).
276pub fn url_encode(s: &str) -> String {
277    percent_encoding::utf8_percent_encode(s, percent_encoding::NON_ALPHANUMERIC)
278        .to_string()
279}
280
281// ----------------------------------------------------------------
282// JSESSIONID extraction
283// ----------------------------------------------------------------
284
285/// Extract JSESSIONID from a header map's `Set-Cookie` values.
286pub fn extract_jsessionid(headers: &hyper::HeaderMap) -> Option<String> {
287    for val in headers.get_all("set-cookie").iter() {
288        for part in val.to_str().unwrap_or("").split(';') {
289            if let Some(id) = part.trim().strip_prefix("JSESSIONID=") {
290                return Some(id.to_string());
291            }
292        }
293    }
294    None
295}