//! unobtanium 3.0.0
//!
//! Opinionated Web search engine library with crawler and viewer companion.
//! Documentation
use log::trace;
use rusqlite::Error;
use uuid::Uuid;
	
use std::collections::HashSet;

use crate::crawling::CrawlType;
use crate::crawling::ExitCode;
use crate::database::id::CrawlSummaryId;
use crate::database::id::EntityGenerationId;
use crate::database::DatabaseError;
use crate::database::Page;
use crate::database::sqlite_helper::*;
use crate::database::summary::structs::SummaryDatabase;
use crate::database::summary::structs::SummaryDatabaseTransaction;
use crate::summary::CrawlSummary;
use crate::summary::HttpSummary;
use crate::summary::WithEntityGenerationUuid;

impl SummaryDatabase {

	/// Return all crawls that contributed to a given entity generation.
	///
	/// Uses a LEFT JOIN because not every crawl has an `http_summary` row
	/// (they are only written when HTTP data was captured — see
	/// `add_crawl_summary_bulk`); an INNER JOIN would silently drop such
	/// crawls. The row mapper treats the HTTP columns as nullable.
	pub fn get_crawl_summaries_for_entity_generation(
		&self,
		entity_generation_id: EntityGenerationId,
		page: Page,
	) -> Result<Vec<CrawlSummary>, DatabaseError> {
		trace!("summary_db.get_crawl_summaries_for_entity_generation()");
		let mut get_crawl_summaries_statement = self.connection().prepare("
			SELECT
				crawl_summary.time_started_unix_utc,
				crawl_summary.was_robotstxt_approved,
				crawl_summary.crawl_type,
				crawl_summary.crawl_uuid,
				crawl_summary.agent_uuid,
				crawl_summary.exit_code,
				crawl_summary.time_last_modified_unix_utc,
				crawl_summary.request_duration_ms,
				http_summary.status_code,
				http_summary.etag
			FROM crawl_summary
			LEFT JOIN http_summary ON
				crawl_summary.crawl_summary_id = http_summary.crawl_summary_id
			WHERE crawl_summary.entity_generation_id = ?
			LIMIT ?
			OFFSET ?
		")?;
		get_crawl_summaries_statement.query_map(
			(entity_generation_id, page.limit(), page.offset()),
			crawl_summary_row_to_crawl_summary
		)?.map(|r| r.map_err(Into::into)).collect()
	}

	/// Fetch a single crawl summary by its primary key.
	///
	/// The HTTP columns are joined with a LEFT JOIN since a crawl may have
	/// no `http_summary` row. If no crawl with this id exists, the
	/// underlying `query_row` error is converted into a `DatabaseError`.
	pub fn get_crawl_summary_by_id(
		&self,
		crawl_summary_id: CrawlSummaryId,
	) -> Result<CrawlSummary, DatabaseError> {
		trace!("summary_db.get_crawl_summary_by_id()");
		self.connection().query_row(
			"SELECT
				crawl_summary.time_started_unix_utc,
				crawl_summary.was_robotstxt_approved,
				crawl_summary.crawl_type,
				crawl_summary.crawl_uuid,
				crawl_summary.agent_uuid,
				crawl_summary.exit_code,
				crawl_summary.time_last_modified_unix_utc,
				crawl_summary.request_duration_ms,
				http_summary.status_code,
				http_summary.etag
			FROM crawl_summary
			LEFT JOIN http_summary ON
				crawl_summary.crawl_summary_id = http_summary.crawl_summary_id
			WHERE crawl_summary.crawl_summary_id = ?
			", (crawl_summary_id,),
			crawl_summary_row_to_crawl_summary
		).map_err(Into::into)
	}

	/// Takes a set of crawl UUIDs and returns a set of crawl UUIDs that
	/// already exist in the database.
	pub fn test_has_crawl_summary_with_uuid_bulk(
		&self,
		crawl_uuids: &[Uuid],
	) -> Result<HashSet<Uuid>, DatabaseError> {
		trace!("summary_db.test_has_crawl_summary_with_uuid_bulk()");
		// TODO: use an IN with dynamic number of arguments
		let mut statement = self.connection().prepare("
			SELECT crawl_uuid
			FROM crawl_summary
			WHERE crawl_uuid = ?
		")?;
		let mut found_uuids = HashSet::new();
		for uuid in crawl_uuids {
			// At most one row per UUID is expected, but collect all results
			// returned by the statement to stay robust against duplicates.
			for res_uuid in statement.query_map((uuid,), |row| {
				row.get::<usize, Uuid>(0)
			})? {
				found_uuids.insert(res_uuid?);
			}
		}
		Ok(found_uuids)
	}

}

impl SummaryDatabaseTransaction<'_> {
	/// Insert a batch of crawl summaries, each tied to an entity generation
	/// by UUID.
	///
	/// For every summary this:
	/// 1. resolves the entity generation UUID to its row id,
	/// 2. inserts the `crawl_summary` row,
	/// 3. widens the entity generation's first/last-seen window to include
	///    this crawl's timestamp,
	/// 4. inserts an `http_summary` row, but only when HTTP data is present.
	pub fn add_crawl_summary_bulk(
		&mut self,
		summaries: &[WithEntityGenerationUuid<CrawlSummary>],
	) -> Result<(), DatabaseError> {
		trace!("summary_db_transaction.add_crawl_summary_bulk()");
		let mut store_crawl_statement = self.connection().prepare_cached(
			"INSERT INTO crawl_summary (
				entity_generation_id,
				was_robotstxt_approved,
				crawl_type,
				time_started_unix_utc,
				crawl_uuid,
				agent_uuid,
				exit_code,
				time_last_modified_unix_utc,
				request_duration_ms
			) VALUES (
				?,?,?,?, ?,?,?,?, ?
			)"
		)?;
		// Update first and last seen so we don't have to do so manually
		let mut update_entity_generation_statement = self.connection().prepare_cached(
			"UPDATE entity_generation
			SET first_seen_unix_utc = min(first_seen_unix_utc, ?),
				last_seen_unix_utc = max(last_seen_unix_utc, ?)
			WHERE entity_generation_id = ?
			"
		)?;
		let mut add_http_summary_statement = self.connection().prepare_cached(
			"INSERT INTO http_summary (
				crawl_summary_id,
				status_code,
				etag
			) VALUES (
				?,?,?
			)"
		)?;
		for summary in summaries {
			let entity_generation_id = self.get_entity_generation_id(summary.entity_generation_uuid)?;
			let crawl_timestamp = summary.data.crawl_time.timestamp();
			store_crawl_statement.execute(
				(
					entity_generation_id,
					summary.data.was_robotstxt_approved,
					summary.data.crawl_type.clone().to_number(),
					crawl_timestamp,
					summary.data.crawl_uuid,
					summary.data.agent_uuid,
					summary.data.exit_code.to_number(),
					to_unix_timestamp_opt(summary.data.server_last_modified),
					summary.data.request_duration_ms,
				)
			)?;
			// Capture the new crawl_summary row id immediately after its
			// INSERT, before running any other statement, so we don't rely
			// on last_insert_rowid() being unaffected by the UPDATE below.
			let crawl_summary_id = self.connection().last_insert_rowid();
			update_entity_generation_statement.execute(
				(
					crawl_timestamp,
					crawl_timestamp,
					entity_generation_id,
				)
			)?;
			if let Some(http) = &summary.data.http {
				add_http_summary_statement.execute(
					(
						crawl_summary_id,
						http.status_code,
						http.etag.clone(),
					)
				)?;
			}
		}
		Ok(())
	}

}

/// Map one joined `crawl_summary`/`http_summary` row into a `CrawlSummary`.
///
/// The column order must match the SELECT statements that use this mapper:
/// 0 time_started, 1 was_robotstxt_approved, 2 crawl_type, 3 crawl_uuid,
/// 4 agent_uuid, 5 exit_code, 6 time_last_modified, 7 request_duration_ms,
/// 8 http status_code, 9 http etag.
///
/// Columns 8 and 9 may be NULL when the crawl has no associated
/// `http_summary` row; a NULL status code yields `http: None`.
fn crawl_summary_row_to_crawl_summary(
	row: &rusqlite::Row,
) -> Result<CrawlSummary, Error> {
	// A NULL status code means no http_summary row was joined in.
	let http_summary = match row.get::<usize, Option<u16>>(8)? {
		Some(status_code) => Some(HttpSummary {
			status_code,
			etag: row.get(9)?,
		}),
		None => None,
	};
	Ok(CrawlSummary {
		crawl_time: from_unix_timestamp_or_epoch(row.get(0)?),
		was_robotstxt_approved: row.get(1)?,
		crawl_type: CrawlType::from_number(row.get(2)?),
		crawl_uuid: row.get(3)?,
		agent_uuid: row.get(4)?,
		exit_code: ExitCode::from_number(row.get(5)?),
		server_last_modified:
			from_unix_timestamp_opt(row.get(6)?),
		request_duration_ms: row.get(7)?,
		http: http_summary,
	})
}