unobtanium 3.0.0

Opinionated Web search engine library with crawler and viewer companion.
Documentation
use serde::{Serialize,Deserialize};

use crate::crawling::ExitCode;
use crate::time::UtcTimestamp;
use crate::url::UrlWithoutFragment;

/// Metadata for URLs that is useful for making scheduling decisions
/// when crawling.
///
/// A candidate created with [`CrawlCandidate::new`] has every field except
/// `url` set to `None`. Crawl results are recorded through
/// [`CrawlCandidate::apply_update`], which writes the fields that belong
/// together in a single step.
#[derive(Clone,Debug,Serialize,Deserialize)]
pub struct CrawlCandidate {

	/// The URL to be crawled.
	pub url: UrlWithoutFragment,

	/// The time of the most recent crawl,
	/// or `None` if the URL was never crawled.
	pub last_crawl_time: Option<UtcTimestamp>,

	/// If the last crawl finished, this is its exit code.
	///
	/// This field must be updated whenever a new `last_crawl_time` is set.
	pub last_crawl_exit_code: Option<ExitCode>,

	/// The timestamp of the last time the content or redirect of
	/// this URL was successfully crawled.
	///
	/// This must be updated together with the `last_contentful_http_etag` field.
	pub last_contentful_crawl_time: Option<UtcTimestamp>,

	/// If the last contentful crawl yielded an ETag, this field contains it.
	///
	/// This field must be updated together with the `last_contentful_crawl_time` field.
	pub last_contentful_http_etag: Option<String>,
}

/// Describes possible updates to the [`CrawlCandidate`] data structure,
/// allowing only valid combinations of modifications.
///
/// The URL the update applies to has to be provided separately;
/// an update itself carries no URL.
#[derive(Clone,Debug,Serialize,Deserialize)]
pub enum CrawlCandidateUpdate {
	/// A crawl that didn't evaluate the content of a URL.
	///
	/// Possible reasons:
	/// * The crawler was only checking if the link is alive
	/// * The connection errored
	/// * The content was the same as last time
	/// * Other valid reasons probably exist.
	///
	/// When applied to a [`CrawlCandidate`], only `last_crawl_time` and
	/// `last_crawl_exit_code` change; the `last_contentful_*` fields
	/// are left untouched.
	ContentlessCrawl {

		/// When the crawl happened.
		time: UtcTimestamp,

		/// If the crawl finished, its exit code; `None` otherwise.
		exit_code: Option<ExitCode>,
	},

	/// A crawl that evaluated the content of the URL.
	///
	/// Possible reasons:
	/// * The URL was fetched and the file ingested
	/// * The URL redirected to somewhere else
	/// * The server was explicit about there being no accessible content
	/// * Other valid reasons probably exist.
	///
	/// When applied to a [`CrawlCandidate`], `time` is stored as both
	/// `last_crawl_time` and `last_contentful_crawl_time`, and the
	/// exit code and ETag fields are overwritten as well.
	ContentfulCrawl {

		/// When the crawl happened.
		time: UtcTimestamp,

		/// The crawl exit code.
		exit_code: ExitCode,

		/// The HTTP ETag if the server provided one.
		///
		/// `None` replaces (clears) any previously stored ETag.
		http_etag: Option<String>,
	},
}

impl CrawlCandidate {
	/// Creates a candidate for `url` with no crawl history.
	///
	/// All crawl-related fields start out as `None`.
	pub fn new(url: UrlWithoutFragment) -> Self {
		Self {
			url,
			last_crawl_time: None,
			last_crawl_exit_code: None,
			last_contentful_crawl_time: None,
			last_contentful_http_etag: None,
		}
	}

	/// Records the outcome of a crawl on this candidate.
	///
	/// Every update overwrites `last_crawl_time` and `last_crawl_exit_code`.
	/// A [`CrawlCandidateUpdate::ContentfulCrawl`] additionally overwrites
	/// `last_contentful_crawl_time` and `last_contentful_http_etag`
	/// (an absent ETag clears the stored one), while a
	/// [`CrawlCandidateUpdate::ContentlessCrawl`] leaves both untouched.
	pub fn apply_update(&mut self, update: CrawlCandidateUpdate) {
		// Handle the variant-specific fields inside the match and hand back
		// the pair that is written for every kind of crawl.
		let (crawled_at, outcome) = match update {
			CrawlCandidateUpdate::ContentlessCrawl { time, exit_code } => (time, exit_code),
			CrawlCandidateUpdate::ContentfulCrawl { time, exit_code, http_etag } => {
				self.last_contentful_crawl_time = Some(time);
				self.last_contentful_http_etag = http_etag;
				(time, Some(exit_code))
			}
		};

		self.last_crawl_time = Some(crawled_at);
		self.last_crawl_exit_code = outcome;
	}
}