unobtanium 3.0.0

Opinionated Web search engine library with crawler and viewer companion.
Documentation
use log::info;
use rusqlite::Error;

use crate::database::{DatabaseError, DatabaseOpeningError};
use crate::database::crawler::structs::CrawlerDatabase;


impl CrawlerDatabase {
	/// Creates the crawler-specific tables and indexes on top of the base
	/// schema, then records the database kind ("crawler") and the crawler
	/// schema version ("1.0.0") via the base database info store.
	///
	/// Idempotent: every DDL statement uses `IF NOT EXISTS`, so calling this
	/// on an already-initialized database is harmless.
	///
	/// # Errors
	/// Returns a [`DatabaseError`] if the database is not writable, if base
	/// initialization fails, or if any SQL statement fails to execute.
	pub fn initalize_crawler_database(&self) -> Result<(), DatabaseError> {

		self.base().assert_writable("initalize_crawler_database")?;
		// Initialize the shared base schema first; the crawler tables below
		// reference ids (url_id, mimetype_id, ...) — presumably managed by
		// the base schema (not visible here).
		self.base().initalize_base_database()?;

		info!("Initalizing database (crawler part) ...");

		info!("Table: agent ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS agent (
				agent_id INTEGER NOT NULL PRIMARY KEY,
				time_started_unix_utc INTEGER NOT NULL,
				time_finished_unix_utc INTEGER NULL,
				agent_uuid BLOB(16) NOT NULL,
				name TEXT NOT NULL,
				http_user_agent TEXT NULL
			);"
		,())?;
		
		info!("Table: crawl_log ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS crawl_log (
				crawl_log_id INTEGER NOT NULL PRIMARY KEY,
				agent_id INTEGER NOT NULL,
				url_id INTEGER NOT NULL,
				-- crawl type numbers are predefined!
				crawl_type INTEGER NOT NULL,
				crawl_uuid BLOB(16) UNIQUE NOT NULL,
				time_started_unix_utc INTEGER NOT NULL,
				time_taken_ms INTEGER NULL,
				exit_code INTEGER NOT NULL,
				message TEXT NULL
			);"
		,())?;

		info!("Index: crawl_log_quickinfo on crawl_log ...");
		// Should speed up querying for the last crawl results of a url
		self.connection().execute(
			"CREATE INDEX IF NOT EXISTS crawl_log_quickinfo ON crawl_log(url_id,time_started_unix_utc,exit_code);"
		,())?;

		info!("Table: request ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS request (
				request_id INTEGER NOT NULL PRIMARY KEY,
				crawl_log_id INTEGER NOT NULL,
				url_id INTEGER NOT NULL,
				time_sent_unix_utc INTEGER NOT NULL,
				request_duration_ms INTEGER NULL,
				robotstxt_approved BOOL NOT NULL,
				exit_code INTEGER NOT NULL,
				server_last_modified_unix_utc INTEGER NULL,
				http_status_code INTEGER NULL,
				http_etag VARCHAR(63) NULL
			);"
		,())?;

		info!("Table: file ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS file (
				file_id INTEGER NOT NULL PRIMARY KEY,
				crawl_log_id INTEGER NOT NULL,
				request_id INTEGER NULL,
				url_id INTEGER NOT NULL,
				last_modified_unix_utc INTEGER NULL,
				-- file specific fields
				-- file size is allowed to be null when the file wasn't fully fetched.
				file_size INTEGER NULL,
				mimetype_id INTEGER NOT NULL,
				canonical_url_id INTEGER NULL
			);"
		,())?;

		info!("Table: redirect ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS redirect (
				redirect_id INTEGER NOT NULL PRIMARY KEY,
				crawl_log_id INTEGER NOT NULL,
				request_id INTEGER NULL,
				url_id INTEGER NOT NULL,
				last_modified_unix_utc INTEGER NULL,
				-- redirect specific fields
				to_url_id INTEGER NOT NULL,
				information_source INTEGER NOT NULL,
				is_permanent BOOL NOT NULL,
				by_security_policy BOOL NOT NULL
			);"
		,())?;

		info!("Table: file_text ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS file_text (
				file_id INTEGER NOT NULL PRIMARY KEY,
				text TEXT NOT NULL
			)"
		,())?;

		info!("Table: crawl_candidate ...");
		self.connection().execute("
			--Note: crawl results needed for rescheduling
			--      are duplicated here so that this table is
			--      independent of the actual results being present.
			
			CREATE TABLE IF NOT EXISTS crawl_candidate (
				url_id INTEGER NOT NULL PRIMARY KEY,
				last_crawl_time_unix_utc INTEGER NULL,
				last_crawl_exit_code INTEGER NULL,
				last_contentful_crawl_time_unix_utc INTEGER NULL,
				last_contentful_http_etag TEXT NULL
			)"
		,())?;

		// Tag the database so check_crawler_schema() can recognize it later.
		self.base().set_database_info("unobtanium_database_kind", Some("crawler"))?;
		self.base().set_database_info("unobtanium_crawler_schema_version", Some("1.0.0"))?;
		
		info!("Database (crawler part) successfully initialized!");
		Ok(())
	}

	/// Verifies that an existing database was created by the crawler and
	/// uses a supported schema version ("1.0.0").
	///
	/// Freshly created databases are accepted without checks (there is no
	/// schema to validate yet). A missing kind entry is tolerated, but a
	/// missing schema version is an error.
	///
	/// # Errors
	/// - [`DatabaseOpeningError::WrongDatabaseKind`] if the kind entry is
	///   present but not "crawler".
	/// - [`DatabaseOpeningError::WrongSchemaVersion`] if the version entry is
	///   present but not "1.0.0".
	/// - [`DatabaseOpeningError::DatabaseUnversioned`] if no version entry exists.
	/// - Any error raised while reading the database info entries.
	pub fn check_crawler_schema(&self) -> Result<(), DatabaseOpeningError> {
		if self.base().is_new() {
			return Ok(());
		}
		if let Some(kind) = self.base().fetch_database_info("unobtanium_database_kind")
			.map_err(|e| e.while_initlizing(self.base().path()))?
		{
			if kind != "crawler" {
				return Err(DatabaseOpeningError::WrongDatabaseKind {
					path: self.base().path().into(),
					// `kind` is already an owned String; move it instead of
					// re-allocating with to_string().
					got_kind: kind,
					expected_kind: "crawler".to_string()
				});
			}
		}
		if let Some(version) = self.base().fetch_database_info("unobtanium_crawler_schema_version")
			.map_err(|e| e.while_initlizing(self.base().path()))?
		{
			match version.as_str() {
				"1.0.0" => Ok(()),
				_ => Err(DatabaseOpeningError::WrongSchemaVersion{
					path: self.base().path().into(),
					schema: "crawler".to_string(),
					got_version: version,
					expected_version: "1.0.0".to_string(),
				})
			}
		} else {
			Err(DatabaseOpeningError::DatabaseUnversioned {
				path: self.base().path().into()
			})
		}
	}

	/// Creates the crawler's session-local tables in the `temp` schema
	/// (`temp.ignore_url`, `temp.ignore_origin`); these only live for the
	/// duration of the current connection.
	///
	/// Idempotent: both statements use `IF NOT EXISTS`.
	///
	/// # Errors
	/// Returns the underlying [`rusqlite::Error`] if either statement fails.
	pub fn initalize_crawler_database_temp_tables(&self) -> Result<(),Error> {
		info!("Initalizing temporary crawler database tables …");
		
		info!("Table: temp.ignore_url ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS temp.ignore_url (
				url_id INTEGER NOT NULL,
				agent_id INTEGER NOT NULL,
				UNIQUE(url_id, agent_id)
			);"
		,())?;
		
		info!("Table: temp.ignore_origin ...");
		self.connection().execute("
			CREATE TABLE IF NOT EXISTS temp.ignore_origin (
				origin_id INTEGER NOT NULL,
				agent_id INTEGER NOT NULL,
				UNIQUE(origin_id, agent_id)
			);"
		,())?;

		info!("Database (crawler temporary tables) successfully initialized!");
		Ok(())
	}
	
}