unobtanium 3.0.0

Opinionated Web search engine library with crawler and viewer companion.
Documentation

use criterium::BooleanCriterium;
use criterium::DirectMatch;
use criterium::NumberCriterium;
use criterium::rusqlite::assembler::*;
use serde::{Serialize,Deserialize};
use uuid::Uuid;

use crate::criterium::UuidCriterium;
use crate::criterium::UrlCriterium;
use crate::database::fields::*;
use crate::database::SummarySchema;
use crate::summary::EntityGeneration;
use crate::time::UtcTimestamp;

/// Criterium for matching against the direct metadata of an entity generation.
///
/// Each variant wraps the criterium that is applied to one piece of
/// entity generation metadata.
///
/// See also: [EntityGeneration]
#[derive(Clone,Debug,Serialize,Deserialize)]
#[serde(rename_all="snake_case")]
pub enum EntityGenerationCriterium {
	/// Match entity generations with the given URL constraint.
	Url(UrlCriterium),

	/// Match against the entity generation uuid.
	Uuid(UuidCriterium),

	/// Match against the time this entity generation was first seen.
	FirstSeen(NumberCriterium<UtcTimestamp>),

	/// Match against the last time the entity generation was seen.
	LastSeen(NumberCriterium<UtcTimestamp>),

	/// Match against the time the end of this entity generation was confirmed.
	///
	/// If it is `None`, this means that no information is here about the end
	/// of this entity generation, meaning it can be considered alive.
	TimeEndConfirmed(NumberCriterium<UtcTimestamp>),

	/// Match against the boolean duplicate marker of an entity generation
	/// to do a quick check on its duplicate status without the
	/// need to join in the `duplicate_summary` table.
	///
	/// This relies on the marker being updated by the database code.
	MarkedDuplicate(BooleanCriterium),

	/// Match only the latest generation of an entity in a given collection.
	/// For the DirectMatch trait this always evaluates to `None`.
	///
	/// In the SQL implementation "latest" means: the generation whose
	/// first seen time is the maximum among all generations sharing the
	/// same url id and url fragment.
	///
	/// Note: You might actually want to test for
	/// `TimeEndConfirmed(NumberCriterium::IsNone)`, as that signals that
	/// the entity generation is still alive.
	/// Being still alive also implies being the newest,
	/// as it is very unlikely that information from
	/// the future got into your database.
	///
	/// This one is also pretty expensive, you probably want to avoid it if possible.
	IsNewest,

	/// Match against the known entity generation lifetime in seconds,
	/// computed as `last_seen - first_seen`.
	KnownLifetimeSeconds(NumberCriterium<i64>),
}

impl AssembleRusqliteQuery<SummarySchema, ()> for EntityGenerationCriterium {
	/// Assemble the SQL query that evaluates this criterium against
	/// the `entity_generation` table.
	///
	/// * `assembly_context` - supplies the table prefix that is used for
	///   the entity generation being matched.
	/// * `_context` - unused, this criterium needs no extra context.
	///
	/// Returns an invertable query for the [SummarySchema].
	fn assemble_rusqlite_query(
		&self,
		assembly_context: &AssemblyContext,
		_context: &()
	) -> InvertableRusqliteQuery<SummarySchema> {
		match self {
			// The URL criterium is assembled under its own `eg_url_` prefix,
			// gets the fragment column of the entity generation passed in
			// and is then joined to the entity generation via the url id.
			Self::Url(c) =>
				c.assemble_rusqlite_query_with_fragment(
					&assembly_context.prefix_with("eg_url_"),
					assembly_context,
					&EntityGenerationField::UrlFragment.into(),
				).inner_join(
					None,
					UrlField::UrlId.into(),
					Some(assembly_context.prefix()),
					EntityGenerationField::UrlId.into()
				),
			Self::Uuid(c) =>
				c.assemble_query(
					assembly_context,
					&EntityGenerationField::EntityGenerationUuid.into()
				),
			Self::FirstSeen(c) =>
				c.assemble_query(
					assembly_context,
					&EntityGenerationField::FirstSeenUnixUtc.into()
				),
			// Must target the last seen column, not the first seen one,
			// so that it agrees with the DirectMatch implementation.
			Self::LastSeen(c) =>
				c.assemble_query(
					assembly_context,
					&EntityGenerationField::LastSeenUnixUtc.into()
				),
			Self::TimeEndConfirmed(c) =>
				c.assemble_query(
					assembly_context,
					&EntityGenerationField::ConfirmedEndUnixUtc.into()
				),
			Self::MarkedDuplicate(c) =>
				c.assemble_query(
					assembly_context,
					&EntityGenerationField::MarkedDuplicate.into()
				),
			// Correlated subquery: the generation is the newest when its
			// first seen time equals the maximum first seen time of all
			// generations with the same url id and url fragment.
			// `{0}` is the outer prefix, `{1}` the prefix of the subquery alias.
			Self::IsNewest =>
				RusqliteQuery::from_static_sql(format!(
					"{0}entity_generation.first_seen_unix_utc =
						(
							SELECT MAX({1}entity_generation.first_seen_unix_utc)
							FROM entity_generation AS {1}entity_generation
							WHERE {1}entity_generation.url_id =
								{0}entity_generation.url_id
							AND {1}entity_generation.url_fragment =
								{0}entity_generation.url_fragment
						)",
					assembly_context.prefix().to_string(),
					assembly_context.prefix().with("eg_newest_").to_string(),
				)).as_invertable(),
			// The lifetime is not a stored column, it is computed in SQL
			// as the difference between the last and first seen time.
			Self::KnownLifetimeSeconds(c) =>
				c.assemble_rusqlite_query_with_subquery(
					assembly_context.prefix().clone(),
					format!(
						"({0}entity_generation.last_seen_unix_utc - {0}entity_generation.first_seen_unix_utc)",
						assembly_context.prefix().to_string()
					).as_str(),
					Vec::new()
				).as_invertable(),
		}
	}
}

impl DirectMatch<EntityGeneration> for EntityGenerationCriterium {

	type Output = Option<bool>;

	/// Evaluate this criterium against an in-memory [EntityGeneration].
	///
	/// Returns `None` for [Self::IsNewest], as that can't be decided
	/// from a single entity generation.
	fn criterium_match(&self, data: &EntityGeneration) -> Self::Output {
		match self {
			// Needs knowledge about the other generations, not available here.
			Self::IsNewest => None,
			Self::Url(criterium) => {
				criterium.criterium_match(&data.url).into()
			},
			Self::Uuid(criterium) => {
				criterium.criterium_match(&data.uuid).into()
			},
			Self::FirstSeen(criterium) => {
				let first_seen = Some(data.first_seen);
				criterium.criterium_match(&first_seen).into()
			},
			Self::LastSeen(criterium) => {
				let last_seen = Some(data.last_seen);
				criterium.criterium_match(&last_seen).into()
			},
			Self::TimeEndConfirmed(criterium) => {
				criterium.criterium_match(&data.time_end_confirmed).into()
			},
			Self::MarkedDuplicate(criterium) => {
				criterium.criterium_match(&data.marked_duplicate).into()
			},
			Self::KnownLifetimeSeconds(criterium) => {
				// Known lifetime is the span between first and last sighting.
				let known_lifetime = data.last_seen - data.first_seen;
				criterium.criterium_match(&known_lifetime.num_seconds()).into()
			},
		}
	}
}

impl DirectMatch<Uuid> for EntityGenerationCriterium {

	type Output = Option<bool>;

	/// Evaluate this criterium against a bare entity generation uuid.
	///
	/// Only [Self::Uuid] is actually evaluated against the uuid.
	/// [Self::IsNewest] yields `None`, every other variant
	/// yields `Some(false)`.
	fn criterium_match(&self, data: &Uuid) -> Self::Output {
		match self {
			Self::Uuid(uuid_criterium) =>
				uuid_criterium.criterium_match(data).into(),
			Self::IsNewest => None,
			// None of these can be answered from the uuid alone.
			Self::Url(_)
			| Self::FirstSeen(_)
			| Self::LastSeen(_)
			| Self::TimeEndConfirmed(_)
			| Self::MarkedDuplicate(_)
			| Self::KnownLifetimeSeconds(_) => Some(false),
		}
	}
}