use log::trace;
use criterium::CriteriumChain;
use criterium::rusqlite::AssembleRusqliteQuery;
use rusqlite::Error;

use crate::crawling::CrawlCandidate;
use crate::crawling::ExitCode;
use crate::criterium::CrawlCandidateCriterium;
use crate::database::crawler::structs::CrawlerDatabase;
use crate::database::crawler::structs::CrawlerDatabaseTransaction;
use crate::database::id::UrlId;
use crate::database::DatabaseError;
use crate::database::error::SmuggleDatabaseErrorExtension;
use crate::database::Page;
use crate::database::sqlite_helper::*;
use crate::url::UrlWithoutFragment;

impl CrawlerDatabase {
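	/// Look up the single crawl candidate stored for `url`.
	///
	/// Fails if the URL is unknown or has no candidate row.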
	pub fn get_crawl_candidate(
		&mut self,
		url: &UrlWithoutFragment
	) -> Result<CrawlCandidate, DatabaseError> {
		trace!("crawler_db.get_crawl_candidate()");
		let url_id = self.base().read_url_id(url)?;
		let mut statement = self.connection().prepare("
			SELECT
				crawl_candidate.last_crawl_time_unix_utc,
				crawl_candidate.last_crawl_exit_code,
				crawl_candidate.last_contentful_crawl_time_unix_utc,
				crawl_candidate.last_contentful_http_etag
			FROM crawl_candidate
			WHERE crawl_candidate.url_id = ?
		")?;
		return statement.query_row(
			(url_id,),
			|row| {
				Ok(CrawlCandidate{
					url: url.clone(),
					last_crawl_time: from_unix_timestamp_opt(row.get(0)?),
					last_crawl_exit_code: ExitCode::from_number_opt(row.get(1)?),
					last_contentful_crawl_time: from_unix_timestamp_opt(row.get(2)?),
					last_contentful_http_etag: row.get(3)?,
				})
			}
		).map_err(Into::into)
	}

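	/// List the crawl candidates matching `criterium_chain`, one `page` at a time.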
	pub fn get_crawl_candidates(
		&self,
		page: &Page,
		criterium_chain: CriteriumChain<CrawlCandidateCriterium>
	) -> Result<Vec<CrawlCandidate>, DatabaseError> {
		trace!("crawler_db.get_crawl_candidates()");
		let mut query = criterium_chain.assemble_rusqlite_query_for_db(&());
		trace!("SQL where: {}", query.sql_where_clause);
		trace!("SQL joins: {}", query.joins_to_sql());
		let mut statement = self.connection().prepare(
			format!("
				SELECT
					crawl_candidate.url_id,
					crawl_candidate.last_crawl_time_unix_utc,
					crawl_candidate.last_crawl_exit_code,
					crawl_candidate.last_contentful_crawl_time_unix_utc,
					crawl_candidate.last_contentful_http_etag
				FROM crawl_candidate
				{}
				WHERE {}
				LIMIT ?
				OFFSET ?",
				query.joins_to_sql(),
				query.sql_where_clause
			).as_str()
		)?;
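		// LIMIT and OFFSET are the last two `?` placeholders in the statement,
		// so their values are appended after the criterium-bound parameters;
		// `where_values` doubles as the full positional parameter list here.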
		query.where_values.push(page.limit().into());
		query.where_values.push(page.offset().into());

		return statement.query_map(
			query.where_values_as_params(),
			|row| {
				Ok(CrawlCandidate{
					url: self.base().get_url_by_id(row.get(0)?)
						.smuggle_through_rusqlite()?,
					last_crawl_time: from_unix_timestamp_opt(row.get(1)?),
					last_crawl_exit_code: ExitCode::from_number_opt(row.get(2)?),
					last_contentful_crawl_time: from_unix_timestamp_opt(row.get(3)?),
					last_contentful_http_etag: row.get(4)?,
				})
			}
		)?.map(|r| r.map_err(Into::into)).collect();
	}
}

impl CrawlerDatabaseTransaction<'_> {

	/// Store a link for an already known URL id; this is used when storing redirects.
	pub fn store_crawler_found_link_by_id(
		&mut self,
		url_id: UrlId
	) -> Result<(), Error> {
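		// INSERT OR IGNORE leaves an existing candidate row, and with it the
		// recorded crawl history, untouched when the URL is already queued.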
		self.connection().execute("
			INSERT OR IGNORE INTO crawl_candidate (
				url_id
			) VALUES (?)
		",(url_id,))?;
		Ok(())
	}

	/// Store links found by the crawler as crawl candidates.
	pub fn store_crawler_found_links(
		&mut self,
		links: &[UrlWithoutFragment],
	) -> Result<(), DatabaseError> {
		self.base_transaction.assert_writable("store_crawler_found_links")?;
		trace!("crawler_db_transaction.store_crawler_found_links()");
		let mut store_statement = self.connection().prepare("
			INSERT OR IGNORE INTO crawl_candidate (
				url_id
			) VALUES (?)
		")?;
		for url in links {
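			// Resolve the URL to an id (the `true` flag presumably creates an
			// id for URLs not seen before).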
			let url_id = self.base_transaction.get_url_id(url, true)?;
			store_statement.execute((
				url_id,
			))?;
		}
		Ok(())
	}

	/// Add or update crawl candidates with new data.
	pub fn store_crawl_candidates(
		&mut self,
		candidates: &[CrawlCandidate]
	) -> Result<(), DatabaseError> {
		self.base_transaction.assert_writable("store_crawl_candidates")?;

		trace!("crawler_db_transaction.store_crawl_candidates()");
		let mut store_statement = self.connection().prepare("
			INSERT OR REPLACE INTO crawl_candidate (
				url_id,
				last_crawl_time_unix_utc,
				last_crawl_exit_code,
				last_contentful_crawl_time_unix_utc,
				last_contentful_http_etag
			) VALUES (?,?,?,?,?)
		")?; 
		for candidate in candidates {
			let url_id = self.base_transaction.get_url_id(&candidate.url, true)?;
			store_statement.execute((
				url_id,
				to_unix_timestamp_opt(candidate.last_crawl_time),
				candidate.last_crawl_exit_code.map(|c| c.to_number()),
				to_unix_timestamp_opt(candidate.last_contentful_crawl_time),
				candidate.last_contentful_http_etag.clone(),
			))?;
		}
		Ok(())
	}

}
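
// A minimal usage sketch (illustration only, not part of the crawler API):
// decide whether a URL is worth re-crawling, using nothing beyond the
// accessors defined above. Assumes the `*_opt` helpers yield `Option`s, as
// their names and the nullable schema columns suggest.
#[allow(dead_code)]
fn should_recrawl_sketch(
	db: &mut CrawlerDatabase,
	url: &UrlWithoutFragment
) -> Result<bool, DatabaseError> {
	let candidate = db.get_crawl_candidate(url)?;
	// Re-crawl anything that has never produced a contentful response.
	Ok(candidate.last_contentful_crawl_time.is_none())
}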