unobtanium 3.0.0

Opinionated web search engine library with crawler and viewer companion.
Documentation
use mediatype::MediaTypeBuf;
use serde::{Serialize, Deserialize};

use crate::crawling::File;
use crate::summary::TextPile;
use crate::time::UtcTimestamp;
use crate::url::UrlWithoutFragment;

/// A summary of a single file together with information derived from its content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileSummary {
	/// Mime-type of the file.
	pub mime_type: Option<MediaTypeBuf>,

	/// Size of the file, when it is known.
	pub size: Option<u64>,

	/// Canonical URL of the resource.
	///
	/// This is usually something the resource declares about itself.
	pub canonical_url: Option<UrlWithoutFragment>,

	/// Text summary intended for full text indexing.
	pub text: Option<TextPile>,

	/// Currently unused: no `LinkStatistics` are implemented yet.
	pub link_stats: Option<LinkStatistics>,

	/// Metadata about the file that was extracted from its content.
	pub description: DocumentDescription,
}

impl FileSummary {
	pub fn from_components(
		file: File,
		document_description: DocumentDescription,
		text_pile: Option<TextPile>,
		link_stats: Option<LinkStatistics>,
	) -> FileSummary {
		return FileSummary {

			mime_type: Some(file.mime),

			size: file.size,
			canonical_url: file.canonical_url,
			
			text: text_pile,

			link_stats: link_stats,
			description: document_description,
		}
	}
}

/// Statistics about incoming and outgoing links.
///
/// The totals relate to the individual counters as follows:
///
/// ```text
/// total_incoming = incoming_external_links + mutual_external_links
///                + incoming_internal_links + mutual_internal_links
/// total_outgoing = outgoing_external_links + mutual_external_links + known_dead_external_links
///                + outgoing_internal_links + mutual_internal_links + known_dead_internal_links
/// ```
///
/// All counters are private; the derived `Default` yields a value with
/// every counter set to zero.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct LinkStatistics {
	// Counters that only contribute to `total_incoming`.
	incoming_internal_links: u64,
	incoming_external_links: u64,

	// Counters that only contribute to `total_outgoing`.
	outgoing_internal_links: u64,
	outgoing_external_links: u64,

	// Counters that contribute to both totals.
	mutual_internal_links: u64,
	mutual_external_links: u64,

	// Dead links are counted as part of `total_outgoing`.
	known_dead_internal_links: u64,
	known_dead_external_links: u64,
}

/// Metadata that was taken directly from the document itself.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DocumentDescription {
	/// Title of the document, displayed to identify the document across sites.
	pub title: Option<String>,

	/// The actual headline of the document.
	///
	/// For HTML this is the first `h1` element.
	pub primary_headline: Option<String>,

	/// The way the document describes or summarizes itself.
	pub description: Option<String>,

	/// ISO code describing the language of the document.
	///
	/// Examples:
	/// * `de`
	/// * `en`
	/// * `de-DE` (RFC compliant locale)
	///
	/// `de_DE` is NOT valid here, that is a POSIX locale.
	pub primary_language: Option<String>,

	// TODO: reintroduce author and license

	/// Publishing time as stated by the document metadata.
	pub date_published: Option<UtcTimestamp>,

	/// Time of the last update as stated by the document metadata.
	pub date_last_updated: Option<UtcTimestamp>,

	// TODO: date_expired

	/// Score for how much the document looks like an index page.
	///
	/// It goes up with every criterion met that speaks for an index page
	/// and down with every criterion met that speaks against one.
	///
	/// A negative value means the document is probably a leaf document,
	/// for example an article.
	pub indexiness: i64,
}