unobtanium 3.0.0

Opinionated web search engine library with crawler and viewer companion.
Documentation
use log::{trace, info};

use crate::database::DatabaseError;
use crate::database::DatabaseOpeningError;
use crate::database::summary::structs::SummaryDatabase;

impl SummaryDatabase {

	/// Initializes the summary database specific tables and indexes that
	/// don't belong to the fts5 search implementation.
	///
	/// The base database is initialized first, then every table and index is
	/// created with `IF NOT EXISTS`, so running this against an already
	/// initialized database only adds what is missing. Finally the database
	/// kind (`summary`) and the summary schema version are recorded via
	/// `set_database_info`.
	///
	/// # Errors
	///
	/// Propagates any error returned by `assert_writable`,
	/// `initalize_base_database`, one of the SQL statements or
	/// `set_database_info`.
	pub fn initalize_summary_database(&self) -> Result<(), DatabaseError> {
		self.base().assert_writable("initalize_summary_database")?;
		// Initialize base database
		self.base().initalize_base_database()?;

		info!("Initalizing database (summary part) ...");

		info!("Table: entity_generation ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS entity_generation (
				entity_generation_id INTEGER NOT NULL PRIMARY KEY,
				url_id INTEGER NOT NULL,
				first_seen_unix_utc INTEGER NOT NULL,
				last_seen_unix_utc INTEGER NOT NULL,
				confirmed_end_unix_utc INTEGER NULL,
				marked_duplicate BOOL NOT NULL DEFAULT 0, -- Follows the duplicate_summary table
				entity_generation_uuid BLOB(16) UNIQUE NOT NULL,
				url_fragment TEXT NULL,
				text_pile_id INTEGER NULL,
				CHECK(first_seen_unix_utc <= last_seen_unix_utc)
			);"
		,())?;

		info!("Index: entity_generation_by_uuid ...");
		self.connection().execute("
			CREATE INDEX IF NOT EXISTS entity_generation_by_uuid
			ON entity_generation(entity_generation_uuid);
		",())?;

		info!("Index: entity_generation_by_first_seen ...");
		self.connection().execute("
			CREATE INDEX IF NOT EXISTS entity_generation_by_first_seen
			ON entity_generation(url_id,url_fragment,first_seen_unix_utc);
		",())?;

		// Speeds up looking for non-expired entity generations given a text_pile_id
		info!("Index: entity_generation_by_text_pile ...");
		self.connection().execute("
			CREATE INDEX IF NOT EXISTS entity_generation_by_text_pile
			ON entity_generation(text_pile_id,confirmed_end_unix_utc);
		",())?;

		// This index needs its own name: reusing `entity_generation_by_text_pile`
		// together with `IF NOT EXISTS` would turn the statement into a silent
		// no-op and the (marked_duplicate, confirmed_end_unix_utc) index would
		// never be created.
		info!("Index: entity_generation_by_active ...");
		self.connection().execute("
			CREATE INDEX IF NOT EXISTS entity_generation_by_active
			ON entity_generation(marked_duplicate, confirmed_end_unix_utc);
		",())?;


		info!("Table: duplicate_summary ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS duplicate_summary (
				subject_entity_generation_id INTEGER NOT NULL,
				duplicate_of_entity_generation_id INTEGER NOT NULL,
				duplicate_status_start_unix_utc INTEGER NOT NULL, -- do not forget to update entity_generation.marked_duplicate
				duplicate_status_end_unix_utc INTEGER NULL
			);"
		,())?;

		// Speeds up looking if some entity generation is a non-expired duplicate
		info!("Index: duplicate_summary_by_duplicate ...");
		self.connection().execute("
			CREATE INDEX IF NOT EXISTS duplicate_summary_by_duplicate
			ON duplicate_summary(subject_entity_generation_id, duplicate_status_end_unix_utc);
		",())?;

		info!("Table: crawl_summary ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS crawl_summary (
				crawl_summary_id INTEGER NOT NULL PRIMARY KEY,
				entity_generation_id INTEGER NOT NULL,
				was_robotstxt_approved BOOL NOT NULL,
				crawl_type INTEGER NOT NULL,
				crawl_uuid BLOB(16) NOT NULL,
				agent_uuid BLOB(16) NOT NULL,
				time_started_unix_utc INTEGER NOT NULL,
				exit_code INTEGER NOT NULL,
				time_last_modified_unix_utc INTEGER NULL,
				request_duration_ms INTEGER NULL
			);"
		,())?;

		// Speeds up looking for already integrated crawls via test_has_crawl_summary_with_uuid_bulk()
		info!("Index: crawl_summary_by_crawl_uuid ...");
		self.connection().execute("
			CREATE INDEX IF NOT EXISTS crawl_summary_by_crawl_uuid
			ON crawl_summary(crawl_uuid);
		",())?;


		info!("Table: http_summary ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS http_summary (
				crawl_summary_id INTEGER NOT NULL PRIMARY KEY,
				status_code INTEGER NOT NULL,
				etag TEXT NULL
			);"
		,())?;

		info!("Table: file_summary ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS file_summary (
				entity_generation_id INTEGER NOT NULL PRIMARY KEY,
				-- file size is allowed to be null when the file wasn't fully fetched.
				file_size INTEGER NULL,
				mimetype_id INTEGER NULL,
				canonical_url_id INTEGER NULL,
				UNIQUE(entity_generation_id)
			);"
		,())?;

		info!("Table: redirect_summary ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS redirect_summary (
				entity_generation_id INTEGER NOT NULL PRIMARY KEY,
				-- redirect specific fields
				to_url_id INTEGER NOT NULL,
				information_source INTEGER NOT NULL,
				is_permanent BOOL NOT NULL,
				by_security_policy BOOL NOT NULL,
				to_url_fragment VARCHAR(255) NULL
			);"
		,())?;

		info!("Table: link_summary ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS link_summary (
				link_summary_id INTEGER NOT NULL PRIMARY KEY,
				entity_generation_id INTEGER NOT NULL,
				link_to_url INTEGER NOT NULL,
				rel_nofollow BOOL NOT NULL DEFAULT 0,
				rel_me BOOL NOT NULL DEFAULT 0,
				rel_tag BOOL NOT NULL DEFAULT 0,
				in_header BOOL NOT NULL DEFAULT 0,
				in_footer BOOL NOT NULL DEFAULT 0,
				in_aside BOOL NOT NULL DEFAULT 0,
				in_nav BOOL NOT NULL DEFAULT 0,
				in_form BOOL NOT NULL DEFAULT 0,
				in_main BOOL NOT NULL DEFAULT 0,
				in_article BOOL NOT NULL DEFAULT 0,
				in_section BOOL NOT NULL DEFAULT 0,
				in_table BOOL NOT NULL DEFAULT 0,
				in_figure BOOL NOT NULL DEFAULT 0,
				in_address BOOL NOT NULL DEFAULT 0,
				in_code BOOL NOT NULL DEFAULT 0,
				in_headline BOOL NOT NULL DEFAULT 0,
				in_list BOOL NOT NULL DEFAULT 0,
				in_paragraph BOOL NOT NULL DEFAULT 0,
				contains_headline BOOL NOT NULL DEFAULT 0,
				destination_type INTEGER NULL,
				link_locality INTEGER NOT NULL,
				html_tag_name VARCHAR(31) NULL,
				link_to_fragment VARCHAR(255) NULL,
				text TEXT NULL
			);"
		,())?;

		info!("Table: document_description ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS document_description (
				--description id is file id
				entity_generation_id INTEGER NOT NULL PRIMARY KEY,

				time_created_unix_utc INTEGER NULL,
				time_updated_unix_utc INTEGER NULL,

				primary_language VARCHAR(31) NULL,
				title TEXT NULL,
				primary_headline TEXT NULL,
				description TEXT NULL,
				indexiness INTEGER NOT NULL
			);"
		,())?;

		info!("Table: text_pile ...");
		self.connection().execute("
			-- contais text relevant for full text search, but not in the document_description
			CREATE TABLE IF NOT EXISTS text_pile (
				text_pile_id INTEGER NOT NULL PRIMARY KEY,
				blake2b512_digest BLOB NOT NULL UNIQUE, -- used for easier duplicate detection.

				text TEXT NOT NULL DEFAULT '' ,
				secondary_text TEXT NOT NULL DEFAULT '' ,
				big_headlines TEXT NOT NULL DEFAULT '' ,
				small_headlines TEXT NOT NULL DEFAULT '' ,
				code_text TEXT NOT NULL DEFAULT '' ,
				quote_text TEXT NOT NULL DEFAULT '' 
			);"
		,())?;

		info!("Table: token ...");
		self.connection().execute("
			-- maps normalized text tokens to numeric ids
			CREATE TABLE IF NOT EXISTS token (
				token_id INTEGER NOT NULL PRIMARY KEY,
				token_text TEXT NOT NULL,
				UNIQUE(token_text)
			);"
		,())?;

		info!("Table: token_statistics ...");
		self.connection().execute("
			-- maps normalized text tokens to numeric ids
			CREATE TABLE IF NOT EXISTS token_statistics (
				token_id INTEGER NOT NULL,
				text_pile_id INTEGER NOT NULL,
				occurances INTEGER NOT NULL,
				UNIQUE(token_id, text_pile_id)
			);"
		,())?;

		// Speeds up summarizing by having a way to look up the
		// statistics entries to delete from the index directly.
		// Since the rowids are grouped by text_pile_id
		// sqlite should also be able to figure out how to realize this index
		// in a very efficient way.
		info!("Index: token_statistics_by_text_pile ...");
		self.connection().execute("
			CREATE INDEX IF NOT EXISTS token_statistics_by_text_pile
			ON token_statistics(text_pile_id)
		",())?;

		// Recorded last, after all tables and indexes were created successfully.
		info!("Setting base schema version ...");
		self.base().set_database_info("unobtanium_database_kind", Some("summary"))?;
		self.base().set_database_info("unobtanium_summary_schema_version", Some("0.0.0"))?;

		Ok(())
	}

	/// Checks that an already existing database is a summary database with
	/// a supported summary schema version.
	///
	/// The method name is historic and kept for caller compatibility; the
	/// schema being checked is the *summary* schema.
	///
	/// A database reported as new by `is_new()` is accepted without further
	/// checks. A missing `unobtanium_database_kind` entry is tolerated, a
	/// missing schema version is not.
	///
	/// # Errors
	///
	/// * `DatabaseOpeningError::WrongDatabaseKind` if the stored kind is not `summary`.
	/// * `DatabaseOpeningError::WrongSchemaVersion` if the stored version is not `0.0.0`.
	/// * `DatabaseOpeningError::DatabaseUnversioned` if no schema version is stored.
	/// * Whatever `while_initlizing` produces when reading the database info fails.
	pub fn check_crawler_schema(&self) -> Result<(), DatabaseOpeningError> {
		// Nothing has been written yet, so there is nothing to verify.
		if self.base().is_new() {
			return Ok(());
		}
		if let Some(kind) = self.base().fetch_database_info("unobtanium_database_kind")
			.map_err(|e| e.while_initlizing(self.base().path()))?
		{
			if kind != "summary" {
				return Err(DatabaseOpeningError::WrongDatabaseKind {
					path: self.base().path().into(),
					got_kind: kind.to_string(),
					expected_kind: "summary".to_string()
				});
			}
		}
		// Read the key that `initalize_summary_database` writes. Looking up
		// the crawler schema version here would, for a database that only
		// carries the summary key, always end in `DatabaseUnversioned`.
		if let Some(version) = self.base().fetch_database_info("unobtanium_summary_schema_version")
			.map_err(|e| e.while_initlizing(self.base().path()))?
		{
			match version.as_str() {
				"0.0.0" => Ok(()),
				_ => Err(DatabaseOpeningError::WrongSchemaVersion{
					path: self.base().path().into(),
					schema: "summary".to_string(),
					got_version: version,
					expected_version: "0.0.0".to_string(),
				})
			}
		} else {
			Err(DatabaseOpeningError::DatabaseUnversioned {
				path: self.base().path().into()
			})
		}
	}

	//////////////////////////////////////
	// Full Text entity Index (fts5)

	/// Removes the `full_text_entity_index` table if it exists, in
	/// preparation for regenerating it.
	///
	/// Dropping a table that isn't there is not an error, because the
	/// statement uses `IF EXISTS`.
	fn drop_full_text_entity_index(&self) -> Result<(), DatabaseError> {
		trace!("summary_db.drop_full_text_entity_index()");
		let drop_statement = "DROP TABLE IF EXISTS full_text_entity_index;";
		self.connection().execute(drop_statement, ())?;
		Ok(())
	}

	/// Rebuilds the fts5 full text index from scratch.
	///
	/// Any existing `full_text_entity_index` table is dropped, the virtual
	/// table is recreated and then filled from `text_pile` joined with
	/// `entity_generation` and `document_description`.
	///
	/// Rows are only indexed when the entity generation has no
	/// `duplicate_summary` entry with an open (`NULL`) end, when `text` or
	/// `secondary_text` is non-empty and when the title is not empty.
	///
	/// # Errors
	///
	/// Propagates any error from dropping, creating or populating the index.
	pub fn regenerate_full_text_entity_index(&self) -> Result<(), DatabaseError> {
		// Schema of the fts5 virtual table.
		let create_index_sql = "
			CREATE VIRTUAL TABLE full_text_entity_index USING fts5 (
				text_pile_id UNINDEXED, -- NOT NULL
				any, -- NOT NULL
				title,
				description,
				text,
				secondary_text,
				big_headlines,
				small_headlines,
				code_text,
				quote_text,
				tokenize='porter unicode61'
			);";

		// Fills the index; the `any` column receives the constant 'any' for every row.
		let populate_index_sql = "
			INSERT INTO full_text_entity_index(
				text_pile_id,
				any,
				title,
				description,
				text,
				secondary_text,
				big_headlines,
				small_headlines,
				code_text,
				quote_text
			) SELECT
				text_pile.text_pile_id,
				'any',
				title,
				description,
				text,
				secondary_text,
				big_headlines,
				small_headlines,
				code_text,
				quote_text
			FROM text_pile
			INNER JOIN entity_generation ON
				text_pile.text_pile_id = entity_generation.text_pile_id
			INNER JOIN document_description ON
				document_description.entity_generation_id = entity_generation.entity_generation_id
			LEFT JOIN duplicate_summary ON
				duplicate_summary.subject_entity_generation_id = entity_generation.entity_generation_id
				AND duplicate_status_end_unix_utc is NULL
			WHERE
				duplicate_summary.subject_entity_generation_id IS NULL AND (
					(text IS NOT NULL AND text != '') OR
					(secondary_text IS NOT NULL AND secondary_text != '')
				) AND title != ''
			;";

		self.drop_full_text_entity_index()?;
		trace!("summary_db.regenerate_full_text_entity_index()");
		info!("Table: full_text_entity_index ...");

		let connection = self.connection();
		connection.execute(create_index_sql, ())?;
		connection.execute(populate_index_sql, ())?;
		Ok(())
	}


}