unobtanium 3.0.0

Opinionated Web search engine library with crawler and viewer companion.
Documentation
use criterium::CriteriumChain;
use criterium::rusqlite::AssembleRusqliteQuery;
use criterium::sql::Field;
use log::trace;

use crate::criterium::EntityCriterium;
use crate::database::error::SmuggleDatabaseErrorExtension;
use crate::database::DatabaseError;
use crate::database::EntityComponentTable;
use crate::database::fields::*;
use crate::database::Page;
use crate::database::sqlite_helper::*;
use crate::database::summary::structs::SummaryDatabase;
use crate::database::SummaryTable;
use crate::search::FullTextEntityWeights;
use crate::search::SearchResult;

impl SummaryDatabase {

	/// The query that decides what shows up when one hits enter on the main search box.
	/// 
	/// Basically a `get_file_summaries()` but optimized for full text searching.
	pub fn get_search_results(
		&self,
		page: &Page,
		criterium_chain: CriteriumChain<EntityCriterium>,
		weights: FullTextEntityWeights,
	) -> Result<Vec<SearchResult>, DatabaseError> {
		trace!("summary_db.get_search_results()");
		let mut query = criterium_chain.assemble_rusqlite_query_for_db(
			&EntityComponentTable::FileSummary
		);
		query = query.inner_join(
			None,
			EntityGenerationField::EntityGenerationId.into(),
			None,
			FileSummaryField::EntityGenerationId.into(),
		).inner_join(
			None,
			DocumentDescriptionField::EntityGenerationId.into(),
			None,
			FileSummaryField::EntityGenerationId.into(),
		);
		let order_by_clause = if query.sql_where_clause.contains("MATCH") {
			// Order by bm25 and give a huge boost to articles
			// This must be tuned a bit in the future
			"ORDER BY ".to_owned() + &weights.to_bm25_sql() +
			" + CASE document_description.indexiness > 0 "+
			" WHEN TRUE THEN 100 ELSE 0 END"
		} else {
			let mut out = "".to_string();
			for join in &query.sql_joins {
				// TODO: Make this work for the more complex cases too
				if join.connector_field.table() == &SummaryTable::TokenStatistics {
					if !out.is_empty() {
						out += " + "
					}
					// TODO: take document length into account
					out = format!("{out} ln({}.occurances)", join.table_alias());
				}
			}
			if !out.is_empty() { out = format!(
				"ORDER BY (
					{out}
					+ CASE document_description.indexiness > 0
					  WHEN TRUE THEN -5 ELSE 0 END
				) DESC"
			); }
			out
		};
		let description_fetch_sql = if query.sql_where_clause.contains("MATCH") {
			"snippet(full_text_entity_index, -1, '»', '«', '…', 32)"
		} else {
			"document_description.description"
		};
		trace!("SQL where: {}", query.sql_where_clause);
		trace!("SQL where values: {:?}", query.where_values);
		trace!("SQL joins: {}", query.joins_to_sql());
		// Note: The subquery makes heavy use of the entity_generation_by_time_started index and will absolutely not scale otherwise.
		let mut get_file_summaries_statement = self.connection().prepare(
			format!("
			SELECT
				entity_generation.entity_generation_uuid,
				entity_generation.url_id,
				entity_generation.url_fragment,
				document_description.title,
				{},
				document_description.time_created_unix_utc,
				document_description.time_updated_unix_utc,
				(select count(*) from text_pile) as N
			FROM file_summary
			{}
			WHERE {}
				AND document_description.title IS NOT null
			{} --order by clause
			LIMIT ?
			OFFSET ?",
				description_fetch_sql,
				query.joins_to_sql(),
				query.sql_where_clause,
				order_by_clause,
			).as_str(),
		)?;
		query.where_values.push(page.limit().into());
		query.where_values.push(page.offset().into());

		return get_file_summaries_statement.query_map(
			query.where_values_as_params(),
			|row| {
				Ok(SearchResult {
					entity_generation_uuid: row.get(0)?,
					url: url_with_fragment_from_row(self.base(), row, 1, 2)
						.smuggle_through_rusqlite()?,
					title: row.get(3)?,
					description: row.get(4)?,
					date_published: from_unix_timestamp_opt(row.get(5)?),
					date_last_updated: from_unix_timestamp_opt(row.get(6)?),
				})
			}
		)?.map(|r| r.map_err(Into::into)).collect();
	}

}