unobtanium 3.0.0

Opinionated Web search engine library with crawler and viewer companion.
Documentation

//! Crawler `ExitCode` data structure and constants.

use serde::{Serialize,Deserialize};
use criterium::number::AsInteger;

use std::str::FromStr;

/// Exit code used by the crawler to roughly categorize what happened
/// and when the next recrawl should happen.
///
/// The code is stored as a plain `i16`; the well-known values are
/// available as associated constants on this type.
///
/// There are `ToString` and `FromStr` implementations that will
/// use the lowercased constant names as string representations
/// and plain numbers as the fallback.
#[derive(Debug,PartialEq,Eq,Copy,Clone,Serialize,Deserialize)]
pub struct ExitCode(i16);

impl ExitCode {

	/// A database error that could not be recovered from happened while crawling.
	///
	/// Fixing the underlying issue and recrawling should help.
	pub const DATABASE_ERROR: ExitCode = ExitCode(-3);

	/// The crawl was cancelled and the data in the database is incomplete.
	///
	/// To recover: purge incomplete crawl information and reschedule.
	pub const CANCELLED: ExitCode = ExitCode(-2);

	/// An already registered crawl couldn't find the corresponding unscheduled scheduling log entries.
	///
	/// This may happen after a race condition in the TODO-list.
	///
	/// The crawler must exit without doing anything in this case.
	pub const SOMEONE_STOLE_MY_WORK: ExitCode = ExitCode(-1);

	/// The crawl completed successfully and a corresponding file was ingested.
	pub const FILE_INGESTED: ExitCode = ExitCode(20);

	/// The crawl yielded a file of unknown type, the metadata was recorded.
	pub const FILE_OF_UNKNOWN_TYPE: ExitCode = ExitCode(29);

	/// The server gave a response indicating that the resource will always redirect.
	///
	/// This maps to the http status codes `301` and `308`
	pub const PERMANENT_REDIRECT: ExitCode = ExitCode(31);

	/// The server redirected the request to another URL, but the target might change in the future.
	///
	/// This maps to the http status codes `302` and `307`
	pub const REDIRECT: ExitCode = ExitCode(32);

	/// The crawler had information about a previous crawl and came to the conclusion that the file content didn't change.
	pub const FILE_DID_NOT_CHANGE: ExitCode = ExitCode(34);

	/// The server sent out an unknown `400` range response code for http or equivalent.
	pub const SERVER_BLAMED_CLIENT: ExitCode = ExitCode(40);
	/// The server indicated that the resource is gone and won't come back (http `410`)
	pub const FILE_GONE: ExitCode = ExitCode(41);
	/// The server sent out a response that the crawler didn't understand.
	pub const DID_NOT_UNDERSTAND_ANSWER: ExitCode = ExitCode(42);
	/// The file was not found on the server (http `404`)
	pub const FILE_NOT_FOUND: ExitCode = ExitCode(44);
	/// We apparently made too many requests (http `429`)
	pub const RATE_LIMITED: ExitCode = ExitCode(49);

	/// The crawler triggered an internal server error, the crawl might succeed later.
	///
	/// This roughly maps to the http `500` error range.
	pub const SERVER_INTERNAL_ERROR: ExitCode = ExitCode(50);

	/// There was an error opening the connection to the server.
	pub const CONNECTION_FAILED: ExitCode = ExitCode(100);
	/// After the connection was open a timeout happened.
	pub const REQUEST_TIMEOUT: ExitCode = ExitCode(101);
	/// After the connection was open an error happened while reading the response.
	pub const ERROR_READING_RESPONSE: ExitCode = ExitCode(102);

	/// At the request of the server's `robots.txt` a resource was not crawled.
	pub const BLOCKED_BY_ROBOTS_TXT: ExitCode = ExitCode(170);

	/// At the request of the server or the served content the crawl was aborted
	/// (i.e. through the `noindex` meta tag)
	pub const BLOCKED_AT_REQUEST_OF_REMOTE: ExitCode = ExitCode(171);

	/// A local policy on the origin level prevented the crawling of this resource
	pub const BLOCKED_ORIGIN_BY_LOCAL_POLICY: ExitCode = ExitCode(172);

	/// A local policy on the url level prevented the crawling of this resource
	pub const BLOCKED_URL_BY_LOCAL_POLICY: ExitCode = ExitCode(173);

	/// The server returned a challenge/captcha page of some sort
	pub const BLOCKED_BY_CHALLENGE: ExitCode = ExitCode(174);

	/// The resource was dismissed as it marked itself as a non-canonical version of another resource.
	pub const NOT_CANONICAL: ExitCode = ExitCode(180);

	/// The resource was found to be a duplicate of another resource.
	///
	/// This may also be reassigned from a [`Self::FILE_INGESTED`] code at a
	/// later stage since recognizing duplicates requires both resources to be present.
	pub const DUPLICATE: ExitCode = ExitCode(181);

	/// An error that doesn't have a defined exit code (yet)
	///
	/// Use sparingly and always try to use a more specific exit code!
	/// (Pull requests welcome!)
	///
	/// *Do **not** use as a replacement for an unknown exit code!*
	pub const UNKNOWN_ERROR: ExitCode = ExitCode(-999);

	/// Whether the exit code represents a redirect condition.
	///
	/// True for [`Self::REDIRECT`] and [`Self::PERMANENT_REDIRECT`].
	pub fn is_redirect(&self) -> bool {
		matches!(*self,
			Self::REDIRECT |
			Self::PERMANENT_REDIRECT
		)
	}

	/// Whether the reply had some kind of content or meaningful absence of content
	/// that was evaluated according to the exit code.
	///
	/// Note that [`Self::FILE_DID_NOT_CHANGE`] is not part of this set.
	pub fn is_contentful(&self) -> bool {
		matches!(*self,
			Self::FILE_INGESTED |
			Self::FILE_OF_UNKNOWN_TYPE |
			Self::PERMANENT_REDIRECT |
			Self::REDIRECT |
			Self::FILE_NOT_FOUND |
			Self::FILE_GONE |
			Self::NOT_CANONICAL |
			Self::DUPLICATE
		)
	}

	/// Whether indexing was blocked either at the request of the server or a local policy.
	///
	/// True for all of the `BLOCKED_*` constants.
	pub fn is_blocked(&self) -> bool {
		matches!(*self,
			Self::BLOCKED_BY_ROBOTS_TXT |
			Self::BLOCKED_AT_REQUEST_OF_REMOTE |
			Self::BLOCKED_ORIGIN_BY_LOCAL_POLICY |
			Self::BLOCKED_URL_BY_LOCAL_POLICY |
			Self::BLOCKED_BY_CHALLENGE
		)
	}

	/// Whether it is worth retrying the request immediately.
	///
	/// True for the connection level errors and for [`Self::UNKNOWN_ERROR`].
	///
	/// Important: Make sure you're not hammering the target system with retries!
	pub fn could_be_a_fluke(&self) -> bool {
		matches!(*self,
			Self::UNKNOWN_ERROR |
			Self::CONNECTION_FAILED |
			Self::REQUEST_TIMEOUT |
			Self::ERROR_READING_RESPONSE
		)
	}
}

/// Renders the exit code as its lowercased constant name, or as the
/// plain number when the code has no name.
///
/// `Display` is implemented instead of `ToString` directly: the standard
/// library's blanket implementation still provides `to_string()` with the
/// same output, and the type additionally works with `{}` in format strings.
impl std::fmt::Display for ExitCode {
	fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
		let name = match *self {
			Self::DATABASE_ERROR => "database_error",
			Self::CANCELLED => "cancelled",
			Self::SOMEONE_STOLE_MY_WORK => "someone_stole_my_work",
			Self::FILE_INGESTED => "file_ingested",
			Self::FILE_OF_UNKNOWN_TYPE => "file_of_unknown_type",
			Self::PERMANENT_REDIRECT => "permanent_redirect",
			Self::REDIRECT => "redirect",
			Self::FILE_DID_NOT_CHANGE => "file_did_not_change",
			Self::SERVER_BLAMED_CLIENT => "server_blamed_client",
			Self::FILE_GONE => "file_gone",
			Self::DID_NOT_UNDERSTAND_ANSWER => "did_not_understand_answer",
			Self::FILE_NOT_FOUND => "file_not_found",
			Self::RATE_LIMITED => "rate_limited",
			Self::SERVER_INTERNAL_ERROR => "server_internal_error",
			Self::CONNECTION_FAILED => "connection_failed",
			Self::REQUEST_TIMEOUT => "request_timeout",
			Self::ERROR_READING_RESPONSE => "error_reading_response",
			Self::BLOCKED_BY_ROBOTS_TXT => "blocked_by_robots_txt",
			Self::BLOCKED_AT_REQUEST_OF_REMOTE => "blocked_at_request_of_remote",
			Self::BLOCKED_ORIGIN_BY_LOCAL_POLICY => "blocked_origin_by_local_policy",
			Self::BLOCKED_URL_BY_LOCAL_POLICY => "blocked_url_by_local_policy",
			Self::BLOCKED_BY_CHALLENGE => "blocked_by_challenge",
			Self::NOT_CANONICAL => "not_canonical",
			Self::DUPLICATE => "duplicate",
			Self::UNKNOWN_ERROR => "unknown_error",
			// No name for this code: just convert to a number.
			// Delegating keeps width/alignment flags working for numbers too.
			_ => return std::fmt::Display::fmt(&self.0, f),
		};
		// `pad` writes the name and honours width/alignment flags.
		f.pad(name)
	}
}

/// Parses an exit code from its lowercased constant name or,
/// failing that, from a plain number in the `i16` range.
impl FromStr for ExitCode {
	type Err = &'static str;

	fn from_str(s: &str) -> Result<Self, Self::Err> {
		// First try the well-known names …
		let named = match s {
			"database_error" => Self::DATABASE_ERROR,
			"cancelled" => Self::CANCELLED,
			"someone_stole_my_work" => Self::SOMEONE_STOLE_MY_WORK,
			"file_ingested" => Self::FILE_INGESTED,
			"file_of_unknown_type" => Self::FILE_OF_UNKNOWN_TYPE,
			"permanent_redirect" => Self::PERMANENT_REDIRECT,
			"redirect" => Self::REDIRECT,
			"file_did_not_change" => Self::FILE_DID_NOT_CHANGE,
			"server_blamed_client" => Self::SERVER_BLAMED_CLIENT,
			"file_gone" => Self::FILE_GONE,
			"did_not_understand_answer" => Self::DID_NOT_UNDERSTAND_ANSWER,
			"file_not_found" => Self::FILE_NOT_FOUND,
			"rate_limited" => Self::RATE_LIMITED,
			"server_internal_error" => Self::SERVER_INTERNAL_ERROR,
			"connection_failed" => Self::CONNECTION_FAILED,
			"request_timeout" => Self::REQUEST_TIMEOUT,
			"error_reading_response" => Self::ERROR_READING_RESPONSE,
			"blocked_by_robots_txt" => Self::BLOCKED_BY_ROBOTS_TXT,
			"blocked_at_request_of_remote" => Self::BLOCKED_AT_REQUEST_OF_REMOTE,
			"blocked_origin_by_local_policy" => Self::BLOCKED_ORIGIN_BY_LOCAL_POLICY,
			"blocked_url_by_local_policy" => Self::BLOCKED_URL_BY_LOCAL_POLICY,
			"blocked_by_challenge" => Self::BLOCKED_BY_CHALLENGE,
			"not_canonical" => Self::NOT_CANONICAL,
			"duplicate" => Self::DUPLICATE,
			"unknown_error" => Self::UNKNOWN_ERROR,
			// … then fall back to interpreting the input as a raw number.
			unnamed => {
				return unnamed
					.parse::<i16>()
					.map(Self)
					.map_err(|_| "Not a recognized crawler exit code! Make sure it is in lower_snake_case or number in the i16 range.");
			}
		};
		Ok(named)
	}
}

impl From<ExitCode> for i16 {
	fn from(exit_code: ExitCode) -> i16 {
		exit_code.0
	}
}

impl From<i16> for ExitCode {
	fn from(n: i16) -> Self {
		Self(n)
	}
}

impl ExitCode {
	/// Wraps a raw `i16` in an exit code.
	pub fn from_number(n: i16) -> Self {
		Self::from(n)
	}

	/// Wraps a raw `i16` in an exit code if there is one, `None` stays `None`.
	pub fn from_number_opt(n: Option<i16>) -> Option<Self> {
		n.map(Self::from)
	}

	/// Returns the raw `i16` behind this exit code.
	pub fn to_number(self) -> i16 {
		i16::from(self)
	}
}

impl AsInteger for ExitCode {
	/// Returns the numeric exit code widened to an `i64`.
	fn as_criterium_i64(&self) -> i64 {
		// `i16` to `i64` is lossless, so `From` expresses it without a cast.
		i64::from(self.to_number())
	}
}