//! git-gemini-forge 0.6.2
//!
//! A simple Gemini server that serves a read-only view of public repositories from a Git forge.
use super::{ForgeApi, error, user_agent};
use isahc::ReadResponseExt;
use isahc::http::header::USER_AGENT;
use texting_robots::Robot as Restrictions;
use url::Url;

// TODO: Some way to safely update the restrictions document at runtime.

/// A representation of the upstream server's robots.txt file.
// NOTE(review): the previous doc claimed that cloning this object shares the
// underlying data, but no `Clone` impl is visible in this file — confirm
// against the rest of the crate before relying on that.
pub struct RobotsTxt {
	/// The parsed rules of the robots.txt file, if one exists and could be parsed.
	/// `None` means unrestricted; localhost hosts are assumed not to have any
	/// restrictions.
	restrictions: Option<Restrictions>,
}

impl RobotsTxt {
	/// Fetches the robots.txt file from the server. If an error is returned,
	/// we assume no restrictions.
	pub fn fetch_from(api: &Box<dyn ForgeApi>, base_url: &Url) -> Self {
		// Fetch robots.txt
		match Self::_fetch(api, base_url) {
			Err(err) => {
				println!(
					"↘⚠️ Warning: Network error while fetching robots.txt. Assuming unrestricted. Error: {err}"
				);
				Self::empty()
			}
			Ok(None) => Self::empty(), // localhost or auth'd
			Ok(Some(restrictions)) => Self::new(restrictions),
		}
	}

	/// Returns `true` if robots.txt prohibits us from visiting the URL,
	/// or `false` if this URL is presently unrestricted.
	pub fn is_restricted(&self, endpoint_url: &Url) -> bool {
		match self.restrictions.as_ref() {
			None => false, // no restrictions
			Some(restrictions) => !restrictions.allowed(endpoint_url.as_str()),
		}
	}

	/// Constructs a new [`RobotsTxt`] using the given restrictions.
	fn new(restrictions: Restrictions) -> Self {
		Self {
			restrictions: Some(restrictions),
		}
	}

	/// Constructs a new unrestricted [`RobotsTxt`].
	fn empty() -> Self {
		Self { restrictions: None }
	}

	/// Retrieves and parses the updated robots.txt file from the server at `base_url`.
	///
	/// Does nothing and returns `Ok(None)` if the URL points to localhost. We do not
	/// retrict localhost.
	fn _fetch(
		api: &Box<dyn ForgeApi>,
		base_url: &Url,
	) -> Result<Option<Restrictions>, error::Error> {
		if let Some(_) = api.access_token_value() {
			// valid auth key is not restricted
			println!(
				"Skipping robots.txt fetch; we have an API token, so we're assuming no restrictions."
			);
			return Ok(None);
		}

		if is_localhost(&base_url) {
			// localhost is not restricted
			println!(
				"Skipping robots.txt fetch; upstream is localhost, so we're assuming no restrictions."
			);
			return Ok(None);
		}

		let uri = base_url.join("/robots.txt").expect("Valid URL path");
		println!("↗ GET HTTP {uri}");
		let req = isahc::Request::get(uri.as_str())
			.header(USER_AGENT, user_agent())
			.body(())
			.expect("Valid request");
		let txt = match isahc::send(req) {
			Err(err) => {
				println!("Request failed due to error: {err}");
				return Err(error::Error::NetworkFailure);
			}
			Ok(mut response) => match response.text() {
				Err(err) => {
					println!("Could not get text from response: {err}");
					return Err(error::Error::UnexpectedResponse);
				}
				Ok(txt) => txt,
			},
		};

		// Parse the robots.txt file
		match Restrictions::new(user_agent(), txt.as_bytes()) {
			Err(err) => {
				// TODO: Maybe try a more simplified direct approach instead? Search txt for "*" or "git-gemini-forge" UA maybe?
				println!(
					"↘⚠️ Warning: Could not parse robots.txt. Assuming unrestricted. Error: {err}"
				);
				Ok(None)
			}
			Ok(restrictions) => Ok(Some(restrictions)),
		}
	}
}

/// Returns `true` if `url` plausibly points at the local machine: a host of
/// `localhost`, a loopback IP address (`127.0.0.0/8` or `::1`), or no host
/// at all.
///
/// The previous `domain()`-based check returned `true` for ANY IP-address
/// host (because `Url::domain()` is `None` for IP hosts), so a public-IP
/// upstream would silently skip the robots.txt fetch. Loopback addresses
/// are now checked explicitly.
fn is_localhost(url: &Url) -> bool {
	match url.host_str() {
		// Host-less URLs keep the original permissive treatment.
		None => true,
		Some(host) => {
			// `host_str` keeps the brackets around IPv6 literals; strip them
			// so the address parses with `std::net`.
			let bare = host.trim_start_matches('[').trim_end_matches(']');
			bare.eq_ignore_ascii_case("localhost")
				|| bare
					.parse::<std::net::IpAddr>()
					.is_ok_and(|ip| ip.is_loopback())
		}
	}
}