use log::info;
use rusqlite::Error;
use crate::database::{DatabaseError, DatabaseOpeningError};
use crate::database::crawler::structs::CrawlerDatabase;
impl CrawlerDatabase {
/// Creates the persistent crawler schema and marks the database as a crawler database.
///
/// The steps run in this order:
/// 1. `assert_writable` and `initalize_base_database` are called on the base database.
/// 2. Every statement in the schema list below is executed; all of them use
///    `IF NOT EXISTS`, so objects that already exist are left as they are.
/// 3. The database info entries `unobtanium_database_kind` (`crawler`) and
///    `unobtanium_crawler_schema_version` (`1.0.0`) are written.
///
/// # Errors
///
/// Returns the first error produced by any of the calls above; statements
/// after the failing one are not executed.
pub fn initalize_crawler_database(&self) -> Result<(), DatabaseError> {
    // Ordered list of (progress message, SQL statement) pairs.
    // The SQL text is kept flush-left on purpose so that the statement text
    // handed to SQLite does not gain any leading whitespace.
    const SCHEMA_STEPS: &[(&str, &str)] = &[
        (
            "Table: agent ...",
            "
CREATE TABLE IF NOT EXISTS agent (
agent_id INTEGER NOT NULL PRIMARY KEY,
time_started_unix_utc INTEGER NOT NULL,
time_finished_unix_utc INTEGER NULL,
agent_uuid BLOB(16) NOT NULL,
name TEXT NOT NULL,
http_user_agent TEXT NULL
);",
        ),
        (
            "Table: crawl_log ...",
            "
CREATE TABLE IF NOT EXISTS crawl_log (
crawl_log_id INTEGER NOT NULL PRIMARY KEY,
agent_id INTEGER NOT NULL,
url_id INTEGER NOT NULL,
-- crawl type numbers are predefined!
crawl_type INTEGER NOT NULL,
crawl_uuid BLOB(16) UNIQUE NOT NULL,
time_started_unix_utc INTEGER NOT NULL,
time_taken_ms INTEGER NULL,
exit_code INTEGER NOT NULL,
message TEXT NULL
);",
        ),
        (
            "Index: crawl_log_quickinfo on crawl_log ...",
            "CREATE INDEX IF NOT EXISTS crawl_log_quickinfo ON crawl_log(url_id,time_started_unix_utc,exit_code);",
        ),
        (
            "Table: request ...",
            "
CREATE TABLE IF NOT EXISTS request (
request_id INTEGER NOT NULL PRIMARY KEY,
crawl_log_id INTEGER NOT NULL,
url_id INTEGER NOT NULL,
time_sent_unix_utc INTEGER NOT NULL,
request_duration_ms INTEGER NULL,
robotstxt_approved BOOL NOT NULL,
exit_code INTEGER NOT NULL,
server_last_modified_unix_utc INTEGER NULL,
http_status_code INTEGER NULL,
http_etag VARCHAR(63) NULL
);",
        ),
        (
            "Table: file ...",
            "
CREATE TABLE IF NOT EXISTS file (
file_id INTEGER NOT NULL PRIMARY KEY,
crawl_log_id INTEGER NOT NULL,
request_id INTEGER NULL,
url_id INTEGER NOT NULL,
last_modified_unix_utc INTEGER NULL,
-- file specific fields
-- file size is allowed to be null when the file wasn't fully fetched.
file_size INTEGER NULL,
mimetype_id INTEGER NOT NULL,
canonical_url_id INTEGER NULL
);",
        ),
        (
            "Table: redirect ...",
            "
CREATE TABLE IF NOT EXISTS redirect (
redirect_id INTEGER NOT NULL PRIMARY KEY,
crawl_log_id INTEGER NOT NULL,
request_id INTEGER NULL,
url_id INTEGER NOT NULL,
last_modified_unix_utc INTEGER NULL,
-- redirect specific fields
to_url_id INTEGER NOT NULL,
information_source INTEGER NOT NULL,
is_permanent BOOL NOT NULL,
by_security_policy BOOL NOT NULL
);",
        ),
        (
            "Table: file_text ...",
            "
CREATE TABLE IF NOT EXISTS file_text (
file_id INTEGER NOT NULL PRIMARY KEY,
text TEXT NOT NULL
)",
        ),
        (
            "Table: crawl_candidate ...",
            "
--Note: crawl results needed for rescheduling
-- are duplicated here so that this table is
-- independent of the actual results being present.
CREATE TABLE IF NOT EXISTS crawl_candidate (
url_id INTEGER NOT NULL PRIMARY KEY,
last_crawl_time_unix_utc INTEGER NULL,
last_crawl_exit_code INTEGER NULL,
last_contentful_crawl_time_unix_utc INTEGER NULL,
last_contentful_http_etag TEXT NULL
)",
        ),
    ];

    self.base().assert_writable("initalize_crawler_database")?;
    self.base().initalize_base_database()?;

    info!("Initalizing database (crawler part) ...");
    for &(progress_message, statement) in SCHEMA_STEPS {
        info!("{}", progress_message);
        // The connection is looked up per statement rather than cached.
        self.connection().execute(statement, ())?;
    }

    // Stamp the database so that `check_crawler_schema` can recognise it later.
    self.base().set_database_info("unobtanium_database_kind", Some("crawler"))?;
    self.base().set_database_info("unobtanium_crawler_schema_version", Some("1.0.0"))?;

    info!("Database (crawler part) successfully initialized!");
    Ok(())
}
/// Checks that an already existing database is a crawler database with a
/// supported crawler schema version.
///
/// A database that the base reports as new (`is_new`) is accepted without
/// any further checks.
///
/// # Errors
///
/// * `WrongDatabaseKind` if a `unobtanium_database_kind` entry exists and
///   is not `crawler`. A missing kind entry is not treated as an error.
/// * `WrongSchemaVersion` if `unobtanium_crawler_schema_version` exists
///   but is not `1.0.0`.
/// * `DatabaseUnversioned` if no `unobtanium_crawler_schema_version`
///   entry exists.
/// * Whatever `while_initlizing` produces when reading a database info
///   entry fails.
pub fn check_crawler_schema(&self) -> Result<(), DatabaseOpeningError> {
    const EXPECTED_KIND: &str = "crawler";
    const EXPECTED_VERSION: &str = "1.0.0";

    if self.base().is_new() {
        return Ok(());
    }

    // Database kind: only a present-but-different value is rejected.
    let stored_kind = self.base()
        .fetch_database_info("unobtanium_database_kind")
        .map_err(|e| e.while_initlizing(self.base().path()))?;
    match stored_kind {
        Some(kind) if kind != EXPECTED_KIND => {
            return Err(DatabaseOpeningError::WrongDatabaseKind {
                path: self.base().path().into(),
                got_kind: kind.to_string(),
                expected_kind: EXPECTED_KIND.to_string(),
            });
        }
        _ => {}
    }

    // Schema version: must be present and match exactly.
    let stored_version = self.base()
        .fetch_database_info("unobtanium_crawler_schema_version")
        .map_err(|e| e.while_initlizing(self.base().path()))?;
    match stored_version {
        None => Err(DatabaseOpeningError::DatabaseUnversioned {
            path: self.base().path().into(),
        }),
        Some(version) if version.as_str() == EXPECTED_VERSION => Ok(()),
        Some(version) => Err(DatabaseOpeningError::WrongSchemaVersion {
            path: self.base().path().into(),
            schema: "crawler".to_string(),
            got_version: version,
            expected_version: EXPECTED_VERSION.to_string(),
        }),
    }
}
/// Creates the crawler's tables in the `temp` schema if they do not exist yet.
///
/// Two tables are created, `temp.ignore_url` and `temp.ignore_origin`.
/// Each pairs an id (`url_id` / `origin_id`) with an `agent_id` and
/// enforces uniqueness of that pair.
///
/// # Errors
///
/// Returns the `rusqlite` error of the first statement that fails; the
/// remaining statements are then not executed.
pub fn initalize_crawler_database_temp_tables(&self) -> Result<(),Error> {
    // Ordered list of (progress message, SQL statement) pairs.
    // The SQL text is kept flush-left on purpose so that the statement text
    // handed to SQLite does not gain any leading whitespace.
    const TEMP_TABLE_STEPS: &[(&str, &str)] = &[
        (
            "Table: temp.ignore_url ...",
            "
CREATE TABLE IF NOT EXISTS temp.ignore_url (
url_id INTEGER NOT NULL,
agent_id INTEGER NOT NULL,
UNIQUE(url_id, agent_id)
);",
        ),
        (
            "Table: temp.ignore_origin ...",
            "
CREATE TABLE IF NOT EXISTS temp.ignore_origin (
origin_id INTEGER NOT NULL,
agent_id INTEGER NOT NULL,
UNIQUE(origin_id, agent_id)
);",
        ),
    ];

    info!("Initalizing temporary crawler database tables …");
    for &(progress_message, statement) in TEMP_TABLE_STEPS {
        info!("{}", progress_message);
        self.connection().execute(statement, ())?;
    }
    info!("Database (crawler temporary tables) successfully initialized!");
    Ok(())
}
}