use super::{ForgeApi, error, user_agent};
use isahc::ReadResponseExt;
use isahc::http::header::USER_AGENT;
use texting_robots::Robot as Restrictions;
use url::Url;
/// Crawl restrictions for one upstream host, as derived from its `robots.txt`.
///
/// Build one with [`RobotsTxt::fetch_from`] and query it with
/// [`RobotsTxt::is_restricted`].
pub struct RobotsTxt {
    /// Parsed rule set; `None` means no restrictions apply, so every endpoint
    /// is reported as unrestricted.
    restrictions: Option<Restrictions>,
}
impl RobotsTxt {
pub fn fetch_from(api: &dyn ForgeApi, base_url: &Url) -> Self {
match Self::_fetch(api, base_url) {
Err(err) => {
println!(
"↘⚠️ Warning: Network error while fetching robots.txt. Assuming unrestricted. Error: {err}"
);
Self::empty()
}
Ok(None) => Self::empty(), Ok(Some(restrictions)) => Self::new(restrictions),
}
}
pub fn is_restricted(&self, endpoint_url: &Url) -> bool {
match self.restrictions.as_ref() {
None => false, Some(restrictions) => !restrictions.allowed(endpoint_url.as_str()),
}
}
fn new(restrictions: Restrictions) -> Self {
Self {
restrictions: Some(restrictions),
}
}
fn empty() -> Self {
Self { restrictions: None }
}
fn _fetch(api: &dyn ForgeApi, base_url: &Url) -> Result<Option<Restrictions>, error::Error> {
if api.access_token_value().is_some() {
println!(
"Skipping robots.txt fetch; we have an API token, so we're assuming no restrictions."
);
return Ok(None);
}
if is_localhost(base_url) {
println!(
"Skipping robots.txt fetch; upstream is localhost, so we're assuming no restrictions."
);
return Ok(None);
}
let uri = base_url.join("/robots.txt").expect("Valid URL path");
println!("↗ GET HTTP {uri}");
let req = isahc::Request::get(uri.as_str())
.header(USER_AGENT, user_agent())
.body(())
.expect("Valid request");
let txt = match isahc::send(req) {
Err(err) => {
println!("Request failed due to error: {err}");
return Err(error::Error::NetworkFailure);
}
Ok(mut response) => match response.text() {
Err(err) => {
println!("Could not get text from response: {err}");
return Err(error::Error::UnexpectedResponse);
}
Ok(txt) => txt,
},
};
match Restrictions::new(user_agent(), txt.as_bytes()) {
Err(err) => {
println!(
"↘⚠️ Warning: Could not parse robots.txt. Assuming unrestricted. Error: {err}"
);
Ok(None)
}
Ok(restrictions) => Ok(Some(restrictions)),
}
}
}
/// Reports whether `url` should be treated as pointing at a local upstream.
///
/// This is `true` when the URL has no domain name at all, or when its domain
/// is exactly `localhost`.
///
/// NOTE(review): `Url::domain` presumably yields `None` for IP-address hosts
/// as well, so any IP-addressed upstream would count as local here — confirm
/// that this is intended.
fn is_localhost(url: &Url) -> bool {
    matches!(url.domain(), None | Some("localhost"))
}