unobtanium 3.0.0

Opinionated Web search engine library with crawler and viewer companion.
Documentation
use criterium::number::AsInteger;
use serde::{Serialize,Deserialize};
use url::Url;

use crate::content::InformationSource;
use crate::content::LocationSignature;

/// An Element with some restrictions to represent a link.
///
/// Bundles the link target with information about where the link was found
/// and about the element that produced it.
#[derive(Clone,Debug,Serialize,Deserialize)]
pub struct Link {
	/// The url the link leads to
	pub to_url: Url,
	/// Where in the file this element is.
	pub source: InformationSource,
	/// The Label that was on the link, `None` if there was none.
	pub text: Option<String>,
	/// A place to store contents of the `rel` attribute
	pub rel: Vec<String>,
	/// The html tag name of the element that generated this link or its equivalent
	pub html_tag_name: String,
	/// Which semantically relevant parents the element had
	pub location_signature: LocationSignature,
	/// How the link destination relates to the document the link is in
	pub link_locality: LinkLocality,
	/// Whether the link contains a headline element in its text.
	pub contains_headline: bool,
}

/// Describes the link destination's relation to the page it is on.
///
/// A <i>site</i> is by default equivalent to a hostname.
///
/// Exceptions:
/// * A site that hosts different sites on subpaths, protocols or ports
/// * Multiple hosts might be considered hosting the same Site
///
/// The explicit discriminants are the numbers accepted by
/// `from_u8_or_outlink()` and produced by the `AsInteger` implementation.
#[repr(u8)]
#[derive(Clone,Debug,PartialEq,Eq,Serialize,Deserialize)]
pub enum LinkLocality {
	/// Self-Links link inside the document and only change the fragment
	SelfLink = 0,
	/// In-Links link to a different document on the same <i>site</i>
	InLink = 1,
	/// Out-Links leave the <i>site</i> they are on. It is a safe fallback and returned by `Default::default()`.
	OutLink = 2,
}

impl Default for LinkLocality {
	fn default() -> Self {
		Self::OutLink
	}
}

impl LinkLocality {
	/// Classifies a link using a set of sane defaults.
	///
	/// It is intended to produce okay results without any configuration.
	///
	/// Rules, checked in this order:
	/// * `link_url` without host => `OutLink`
	/// * Different host => `OutLink`
	/// * Different port => `OutLink`
	/// * Different scheme after removing one trailing 's' => `OutLink`
	/// * Different path => `InLink`
	/// * Different query => `InLink`
	/// * Else => `SelfLink`
	///
	/// Wrapping this function to produce configurable and more
	/// sophisticated behavior is encouraged.
	pub fn new(link_url: &Url, document_url: &Url) -> Self {
		// Drops a single trailing 's' so that scheme pairs such as
		// http/https compare equal.
		fn without_trailing_s(scheme: &str) -> &str {
			scheme.strip_suffix('s').unwrap_or(scheme)
		}

		// Anything that does not provably stay on the same site is an
		// out-link, which is the documented safe fallback.
		let leaves_site = !link_url.has_host()
			|| link_url.host() != document_url.host()
			// Ports are compared exactly as `port()` reports them, no
			// mapping to the default port of a scheme is attempted.
			// This imprecision is accepted for performance reasons.
			|| link_url.port() != document_url.port()
			|| without_trailing_s(link_url.scheme())
				!= without_trailing_s(document_url.scheme());
		if leaves_site {
			return Self::OutLink;
		}

		// From here on the link stays on the same site.
		//
		// The query comparison is a plain equality check, recognizing
		// common parameters (Apache index sorting, etc.) is out of scope
		// for this function.
		let same_document = link_url.path() == document_url.path()
			&& link_url.query() == document_url.query();

		if same_document {
			Self::SelfLink
		} else {
			Self::InLink
		}
	}

	/// Converts a number to the matching variant, this can not fail.
	///
	/// `0` is a self-link, `1` is an in-link, every other number
	/// (including unknown ones) becomes an out-link.
	pub fn from_u8_or_outlink(id: u8) -> Self {
		match id {
			0 => Self::SelfLink,
			1 => Self::InLink,
			_ => Self::OutLink,
		}
	}
}

/// Exposes the locality as its numeric discriminant.
impl AsInteger for LinkLocality {
	/// Returns the enum discriminant widened to an `i64`.
	fn as_criterium_i64(&self) -> i64 {
		// Cast through the declared `u8` representation first,
		// widening to i64 afterwards is lossless.
		let discriminant = self.clone() as u8;
		i64::from(discriminant)
	}
}