use super::types::{FetchError, ResponseMeta};
/// Maximum number of HTML meta-refresh redirects `fetch_html_inner` follows
/// after the initial request before it gives up with an error.
const MAX_META_REDIRECTS: usize = 3;
/// Fetches `url` and returns the response body together with its metadata,
/// following HTML `<meta http-equiv="refresh">` redirects along the way.
///
/// The initial request plus at most `MAX_META_REDIRECTS` refresh targets are
/// fetched. The returned `ResponseMeta` describes the response whose body is
/// returned, i.e. the last hop. The HTTP status is not inspected here; it is
/// only recorded in the metadata.
///
/// # Errors
///
/// Returns a `FetchError` when a request cannot be sent, when a response body
/// cannot be read, or when the page reached after `MAX_META_REDIRECTS`
/// redirects still asks for another refresh.
pub async fn fetch_html_inner(
    client: &reqwest::Client,
    url: &str,
) -> Result<(String, ResponseMeta), FetchError> {
    let mut current_url = url.to_owned();

    // One initial request plus up to MAX_META_REDIRECTS follow-up requests.
    for _ in 0..=MAX_META_REDIRECTS {
        let resp = client
            .get(&current_url)
            .send()
            .await
            .map_err(|e| FetchError {
                error: format!("failed to fetch {current_url}: {e}"),
                url: Some(current_url.clone()),
            })?;

        // Capture status and headers first: reading the body consumes `resp`.
        let meta = ResponseMeta {
            status_code: Some(resp.status().as_u16()),
            content_type: resp
                .headers()
                .get(reqwest::header::CONTENT_TYPE)
                .and_then(|v| v.to_str().ok())
                .map(str::to_owned),
        };

        let body = resp.text().await.map_err(|e| FetchError {
            error: format!("failed to read response body: {e}"),
            url: Some(current_url.clone()),
        })?;

        // A meta-refresh target replaces the current URL and triggers another round.
        if let Some(target) = extract_meta_refresh(&body, &current_url) {
            current_url = target;
            continue;
        }

        return Ok((body, meta));
    }

    // Every allowed hop ended in yet another refresh; report the URL we stopped at.
    Err(FetchError {
        error: format!("too many meta-refresh redirects (max {MAX_META_REDIRECTS})"),
        url: Some(current_url),
    })
}
/// Returns the redirect target announced by the first
/// `<meta http-equiv="refresh">` element in `html`, if there is one.
///
/// Relative targets are resolved against `base_url`. When `base_url` itself
/// cannot be parsed, the target is returned exactly as written in the
/// document. Returns `None` when there is no refresh element, it has no
/// `content` attribute, the attribute carries no URL (for example a plain
/// timed reload such as `content="30"`), or the target cannot be joined onto
/// the base URL.
fn extract_meta_refresh(html: &str, base_url: &str) -> Option<String> {
    let doc = scraper::Html::parse_document(html);
    // The trailing `i` asks for case-insensitive matching of the attribute
    // value, so `Refresh` and `REFRESH` are found as well.
    let sel = scraper::Selector::parse("meta[http-equiv=\"refresh\" i]").ok()?;
    let content = doc.select(&sel).next()?.value().attr("content")?;
    let raw_target = parse_refresh_content(content)?;

    match url::Url::parse(base_url) {
        Ok(base) => base.join(raw_target).ok().map(|u| u.to_string()),
        // Without a usable base there is nothing to resolve against.
        Err(_) => Some(raw_target.to_owned()),
    }
}

/// Extracts the URL part of a refresh `content` value such as `5; url=/next`.
///
/// The `url` key is matched case-insensitively and may be separated from the
/// `=` sign by whitespace (`URL = /next`). Surrounding whitespace and quote
/// characters are stripped from the value. Returns `None` when no `url` key
/// followed by `=` is present, or when its value is empty.
fn parse_refresh_content(content: &str) -> Option<&str> {
    // ASCII lowercasing keeps every byte offset identical, so positions found
    // in `lower` can be used to slice `content` directly.
    let lower = content.to_ascii_lowercase();
    let mut search_from = 0;

    while let Some(found) = lower[search_from..].find("url") {
        let after_key = search_from + found + "url".len();
        if let Some(value) = content[after_key..].trim_start().strip_prefix('=') {
            let target = value.trim().trim_matches(['"', '\'']);
            return if target.is_empty() { None } else { Some(target) };
        }
        // This "url" was not followed by `=` (e.g. part of a word); keep looking.
        search_from = after_key;
    }

    None
}