use log::{trace, info};
use crate::database::DatabaseError;
use crate::database::DatabaseOpeningError;
use crate::database::summary::structs::SummaryDatabase;
impl SummaryDatabase {
/// Creates the summary part of the database schema and records its kind and version.
///
/// After asserting that the database is writable and initializing the base schema,
/// every table and index of the summary schema is created with `IF NOT EXISTS`,
/// so statements for objects that already exist are no-ops.
/// Finally the database info entries `unobtanium_database_kind` (`"summary"`) and
/// `unobtanium_summary_schema_version` (`"0.0.0"`) are written.
///
/// # Errors
///
/// Returns a [`DatabaseError`] as soon as the writability assertion, the base
/// initialization, one of the SQL statements or writing a database info entry fails.
/// Statements executed before the failing one are not undone by this function.
pub fn initalize_summary_database(&self) -> Result<(), DatabaseError> {
    self.base().assert_writable("initalize_summary_database")?;
    self.base().initalize_base_database()?;

    info!("Initalizing database (summary part) ...");

    // --- entity_generation and its indexes ---

    info!("Table: entity_generation ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS entity_generation (
entity_generation_id INTEGER NOT NULL PRIMARY KEY,
url_id INTEGER NOT NULL,
first_seen_unix_utc INTEGER NOT NULL,
last_seen_unix_utc INTEGER NOT NULL,
confirmed_end_unix_utc INTEGER NULL,
marked_duplicate BOOL NOT NULL DEFAULT 0, -- Follows the duplicate_summary table
entity_generation_uuid BLOB(16) UNIQUE NOT NULL,
url_fragment TEXT NULL,
text_pile_id INTEGER NULL,
CHECK(first_seen_unix_utc <= last_seen_unix_utc)
);", ())?;

    info!("Index: entity_generation_by_uuid ...");
    self.connection().execute("
CREATE INDEX IF NOT EXISTS entity_generation_by_uuid
ON entity_generation(entity_generation_uuid);
", ())?;

    info!("Index: entity_generation_by_first_seen ...");
    self.connection().execute("
CREATE INDEX IF NOT EXISTS entity_generation_by_first_seen
ON entity_generation(url_id,url_fragment,first_seen_unix_utc);
", ())?;

    info!("Index: entity_generation_by_text_pile ...");
    self.connection().execute("
CREATE INDEX IF NOT EXISTS entity_generation_by_text_pile
ON entity_generation(text_pile_id,confirmed_end_unix_utc);
", ())?;

    // This index must carry its own name: when it reused the name
    // `entity_generation_by_text_pile`, `IF NOT EXISTS` turned the statement
    // into a no-op and the index was never created.
    info!("Index: entity_generation_by_active ...");
    self.connection().execute("
CREATE INDEX IF NOT EXISTS entity_generation_by_active
ON entity_generation(marked_duplicate, confirmed_end_unix_utc);
", ())?;

    // --- duplicate_summary ---

    info!("Table: duplicate_summary ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS duplicate_summary (
subject_entity_generation_id INTEGER NOT NULL,
duplicate_of_entity_generation_id INTEGER NOT NULL,
duplicate_status_start_unix_utc INTEGER NOT NULL, -- do not forget to update entity_generation.marked_duplicate
duplicate_status_end_unix_utc INTEGER NULL
);", ())?;

    info!("Index: duplicate_summary_by_duplicate ...");
    self.connection().execute("
CREATE INDEX IF NOT EXISTS duplicate_summary_by_duplicate
ON duplicate_summary(subject_entity_generation_id, duplicate_status_end_unix_utc);
", ())?;

    // --- crawl_summary and http_summary ---

    info!("Table: crawl_summary ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS crawl_summary (
crawl_summary_id INTEGER NOT NULL PRIMARY KEY,
entity_generation_id INTEGER NOT NULL,
was_robotstxt_approved BOOL NOT NULL,
crawl_type INTEGER NOT NULL,
crawl_uuid BLOB(16) NOT NULL,
agent_uuid BLOB(16) NOT NULL,
time_started_unix_utc INTEGER NOT NULL,
exit_code INTEGER NOT NULL,
time_last_modified_unix_utc INTEGER NULL,
request_duration_ms INTEGER NULL
);", ())?;

    info!("Index: crawl_summary_by_crawl_uuid ...");
    self.connection().execute("
CREATE INDEX IF NOT EXISTS crawl_summary_by_crawl_uuid
ON crawl_summary(crawl_uuid);
", ())?;

    info!("Table: http_summary ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS http_summary (
crawl_summary_id INTEGER NOT NULL PRIMARY KEY,
status_code INTEGER NOT NULL,
etag TEXT NULL
);", ())?;

    // --- per entity generation summaries ---

    info!("Table: file_summary ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS file_summary (
entity_generation_id INTEGER NOT NULL PRIMARY KEY,
-- file size is allowed to be null when the file wasn't fully fetched.
file_size INTEGER NULL,
mimetype_id INTEGER NULL,
canonical_url_id INTEGER NULL,
UNIQUE(entity_generation_id)
);", ())?;

    info!("Table: redirect_summary ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS redirect_summary (
entity_generation_id INTEGER NOT NULL PRIMARY KEY,
-- redirect specific fields
to_url_id INTEGER NOT NULL,
information_source INTEGER NOT NULL,
is_permanent BOOL NOT NULL,
by_security_policy BOOL NOT NULL,
to_url_fragment VARCHAR(255) NULL
);", ())?;

    info!("Table: link_summary ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS link_summary (
link_summary_id INTEGER NOT NULL PRIMARY KEY,
entity_generation_id INTEGER NOT NULL,
link_to_url INTEGER NOT NULL,
rel_nofollow BOOL NOT NULL DEFAULT 0,
rel_me BOOL NOT NULL DEFAULT 0,
rel_tag BOOL NOT NULL DEFAULT 0,
in_header BOOL NOT NULL DEFAULT 0,
in_footer BOOL NOT NULL DEFAULT 0,
in_aside BOOL NOT NULL DEFAULT 0,
in_nav BOOL NOT NULL DEFAULT 0,
in_form BOOL NOT NULL DEFAULT 0,
in_main BOOL NOT NULL DEFAULT 0,
in_article BOOL NOT NULL DEFAULT 0,
in_section BOOL NOT NULL DEFAULT 0,
in_table BOOL NOT NULL DEFAULT 0,
in_figure BOOL NOT NULL DEFAULT 0,
in_address BOOL NOT NULL DEFAULT 0,
in_code BOOL NOT NULL DEFAULT 0,
in_headline BOOL NOT NULL DEFAULT 0,
in_list BOOL NOT NULL DEFAULT 0,
in_paragraph BOOL NOT NULL DEFAULT 0,
contains_headline BOOL NOT NULL DEFAULT 0,
destination_type INTEGER NULL,
link_locality INTEGER NOT NULL,
html_tag_name VARCHAR(31) NULL,
link_to_fragment VARCHAR(255) NULL,
text TEXT NULL
);", ())?;

    info!("Table: document_description ...");
    self.connection().execute("
CREATE TABLE IF NOT EXISTS document_description (
--description id is file id
entity_generation_id INTEGER NOT NULL PRIMARY KEY,
time_created_unix_utc INTEGER NULL,
time_updated_unix_utc INTEGER NULL,
primary_language VARCHAR(31) NULL,
title TEXT NULL,
primary_headline TEXT NULL,
description TEXT NULL,
indexiness INTEGER NOT NULL
);", ())?;

    // --- text storage and token bookkeeping ---

    info!("Table: text_pile ...");
    self.connection().execute("
-- contais text relevant for full text search, but not in the document_description
CREATE TABLE IF NOT EXISTS text_pile (
text_pile_id INTEGER NOT NULL PRIMARY KEY,
blake2b512_digest BLOB NOT NULL UNIQUE, -- used for easier duplicate detection.
text TEXT NOT NULL DEFAULT '' ,
secondary_text TEXT NOT NULL DEFAULT '' ,
big_headlines TEXT NOT NULL DEFAULT '' ,
small_headlines TEXT NOT NULL DEFAULT '' ,
code_text TEXT NOT NULL DEFAULT '' ,
quote_text TEXT NOT NULL DEFAULT ''
);", ())?;

    info!("Table: token ...");
    self.connection().execute("
-- maps normalized text tokens to numeric ids
CREATE TABLE IF NOT EXISTS token (
token_id INTEGER NOT NULL PRIMARY KEY,
token_text TEXT NOT NULL,
UNIQUE(token_text)
);", ())?;

    // NOTE(review): the SQL comment inside the next statement looks copied from
    // the `token` table; the table itself stores an occurrence count per
    // (token_id, text_pile_id) pair.
    info!("Table: token_statistics ...");
    self.connection().execute("
-- maps normalized text tokens to numeric ids
CREATE TABLE IF NOT EXISTS token_statistics (
token_id INTEGER NOT NULL,
text_pile_id INTEGER NOT NULL,
occurances INTEGER NOT NULL,
UNIQUE(token_id, text_pile_id)
);", ())?;

    info!("Index: token_statistics_by_text_pile ...");
    self.connection().execute("
CREATE INDEX IF NOT EXISTS token_statistics_by_text_pile
ON token_statistics(text_pile_id)
", ())?;

    // --- database kind and schema version markers ---

    info!("Setting base schema version ...");
    self.base().set_database_info("unobtanium_database_kind", Some("summary"))?;
    self.base().set_database_info("unobtanium_summary_schema_version", Some("0.0.0"))?;

    Ok(())
}
/// Checks that an existing database is a summary database with a supported schema version.
///
/// A database that `self.base().is_new()` reports as new is accepted without any check.
/// Otherwise two database info entries are inspected:
///
/// * `unobtanium_database_kind`: if present it must be `"summary"`;
///   a missing entry is tolerated.
/// * `unobtanium_summary_schema_version`: must be present and equal to `"0.0.0"`.
///   This is the key written by `initalize_summary_database`.
///
/// # Errors
///
/// * [`DatabaseOpeningError::WrongDatabaseKind`] when a different kind is stored.
/// * [`DatabaseOpeningError::WrongSchemaVersion`] when a different version is stored.
/// * [`DatabaseOpeningError::DatabaseUnversioned`] when no version is stored.
/// * Whatever `while_initlizing` produces when reading a database info entry fails.
pub fn check_crawler_schema(&self) -> Result<(), DatabaseOpeningError> {
    // A freshly created database has nothing to verify yet.
    if self.base().is_new() {
        return Ok(());
    }

    let stored_kind = self.base().fetch_database_info("unobtanium_database_kind")
        .map_err(|e| e.while_initlizing(self.base().path()))?;
    if let Some(kind) = stored_kind {
        if kind != "summary" {
            return Err(DatabaseOpeningError::WrongDatabaseKind {
                path: self.base().path().into(),
                got_kind: kind.to_string(),
                expected_kind: "summary".to_string(),
            });
        }
    }

    // The summary schema version is stored under the summary key. Reading the
    // crawler key here would find nothing in a database set up by
    // `initalize_summary_database` and report it as unversioned.
    let stored_version = self.base().fetch_database_info("unobtanium_summary_schema_version")
        .map_err(|e| e.while_initlizing(self.base().path()))?;
    match stored_version {
        Some(version) => match version.as_str() {
            "0.0.0" => Ok(()),
            _ => Err(DatabaseOpeningError::WrongSchemaVersion {
                path: self.base().path().into(),
                schema: "summary".to_string(),
                got_version: version,
                expected_version: "0.0.0".to_string(),
            }),
        },
        None => Err(DatabaseOpeningError::DatabaseUnversioned {
            path: self.base().path().into(),
        }),
    }
}
/// Drops the `full_text_entity_index` table if it exists.
///
/// # Errors
///
/// Returns a [`DatabaseError`] when executing the `DROP TABLE` statement fails.
fn drop_full_text_entity_index(&self) -> Result<(), DatabaseError> {
    trace!("summary_db.drop_full_text_entity_index()");

    let drop_statement = "DROP TABLE IF EXISTS full_text_entity_index;";
    self.connection().execute(drop_statement, ())?;

    Ok(())
}
/// Rebuilds the `full_text_entity_index` FTS5 table from scratch.
///
/// The existing table is dropped, a new virtual table is created and then filled
/// with one row per match of `text_pile` joined with `entity_generation` and
/// `document_description`. A row is only inserted when
///
/// * no `duplicate_summary` entry with an open end
///   (`duplicate_status_end_unix_utc IS NULL`) exists for the entity generation,
/// * `text` or `secondary_text` is non-empty, and
/// * the condition `title != ''` holds.
///
/// The `any` column receives the constant string `'any'` for every inserted row.
///
/// # Errors
///
/// Returns a [`DatabaseError`] when dropping, creating or filling the table fails.
/// The three statements are issued one after another by this function; a failure
/// in a later one leaves the effects of the earlier ones in place as far as this
/// function is concerned.
pub fn regenerate_full_text_entity_index(&self) -> Result<(), DatabaseError> {
    // Definition of the virtual table; `text_pile_id` is stored but not indexed.
    const CREATE_INDEX_TABLE_SQL: &str = "
CREATE VIRTUAL TABLE full_text_entity_index USING fts5 (
text_pile_id UNINDEXED, -- NOT NULL
any, -- NOT NULL
title,
description,
text,
secondary_text,
big_headlines,
small_headlines,
code_text,
quote_text,
tokenize='porter unicode61'
);";

    // Copies the searchable text of every non-duplicate entity generation.
    const FILL_INDEX_TABLE_SQL: &str = "
INSERT INTO full_text_entity_index(
text_pile_id,
any,
title,
description,
text,
secondary_text,
big_headlines,
small_headlines,
code_text,
quote_text
) SELECT
text_pile.text_pile_id,
'any',
title,
description,
text,
secondary_text,
big_headlines,
small_headlines,
code_text,
quote_text
FROM text_pile
INNER JOIN entity_generation ON
text_pile.text_pile_id = entity_generation.text_pile_id
INNER JOIN document_description ON
document_description.entity_generation_id = entity_generation.entity_generation_id
LEFT JOIN duplicate_summary ON
duplicate_summary.subject_entity_generation_id = entity_generation.entity_generation_id
AND duplicate_status_end_unix_utc is NULL
WHERE
duplicate_summary.subject_entity_generation_id IS NULL AND (
(text IS NOT NULL AND text != '') OR
(secondary_text IS NOT NULL AND secondary_text != '')
) AND title != ''
;";

    // The old index is removed first so that the plain CREATE below cannot collide.
    self.drop_full_text_entity_index()?;

    trace!("summary_db.regenerate_full_text_entity_index()");
    info!("Table: full_text_entity_index ...");

    let connection = self.connection();
    connection.execute(CREATE_INDEX_TABLE_SQL, ())?;
    connection.execute(FILL_INDEX_TABLE_SQL, ())?;

    Ok(())
}
}