use log::trace;
use rusqlite::Error;
use uuid::Uuid;
use std::collections::HashSet;
use crate::crawling::CrawlType;
use crate::crawling::ExitCode;
use crate::database::id::CrawlSummaryId;
use crate::database::id::EntityGenerationId;
use crate::database::DatabaseError;
use crate::database::Page;
use crate::database::sqlite_helper::*;
use crate::database::summary::structs::SummaryDatabase;
use crate::database::summary::structs::SummaryDatabaseTransaction;
use crate::summary::CrawlSummary;
use crate::summary::HttpSummary;
use crate::summary::WithEntityGenerationUuid;
impl SummaryDatabase {
    /// Returns one page of crawl summaries for the given entity generation,
    /// each joined with its HTTP summary.
    ///
    /// Note: because of the INNER JOIN, crawls that have no matching
    /// `http_summary` row are not returned by this query.
    pub fn get_crawl_summaries_for_entity_generation(
        &self,
        entity_generation_id: EntityGenerationId,
        page: Page,
    ) -> Result<Vec<CrawlSummary>, DatabaseError> {
        trace!("summary_db.get_crawl_summaries_for_entity_generation()");
        let mut get_crawl_summaries_statement = self.connection().prepare("
SELECT
crawl_summary.time_started_unix_utc,
crawl_summary.was_robotstxt_approved,
crawl_summary.crawl_type,
crawl_summary.crawl_uuid,
crawl_summary.agent_uuid,
crawl_summary.exit_code,
crawl_summary.time_last_modified_unix_utc,
crawl_summary.request_duration_ms,
http_summary.status_code,
http_summary.etag
FROM crawl_summary
INNER JOIN http_summary ON
crawl_summary.crawl_summary_id = http_summary.crawl_summary_id
WHERE crawl_summary.entity_generation_id = ?
LIMIT ?
OFFSET ?
")?;
        // Collecting into Result<Vec<_>, _> short-circuits on the first row
        // that fails to map or deserialize.
        get_crawl_summaries_statement.query_map(
            (entity_generation_id, page.limit(), page.offset()),
            crawl_summary_row_to_crawl_summary,
        )?.map(|r| r.map_err(Into::into)).collect()
    }

    /// Looks up a single crawl summary by its primary key.
    ///
    /// Fails (with rusqlite's "no rows" error converted to `DatabaseError`)
    /// when the id does not exist or has no matching `http_summary` row.
    pub fn get_crawl_summary_by_id(
        &self,
        crawl_summary_id: CrawlSummaryId,
    ) -> Result<CrawlSummary, DatabaseError> {
        trace!("summary_db.get_crawl_summary_by_id()");
        self.connection().query_row(
            "SELECT
crawl_summary.time_started_unix_utc,
crawl_summary.was_robotstxt_approved,
crawl_summary.crawl_type,
crawl_summary.crawl_uuid,
crawl_summary.agent_uuid,
crawl_summary.exit_code,
crawl_summary.time_last_modified_unix_utc,
crawl_summary.request_duration_ms,
http_summary.status_code,
http_summary.etag
FROM crawl_summary
INNER JOIN http_summary ON
crawl_summary.crawl_summary_id = http_summary.crawl_summary_id
WHERE crawl_summary.crawl_summary_id = ?
", (crawl_summary_id,),
            crawl_summary_row_to_crawl_summary,
        ).map_err(Into::into)
    }

    /// Given a set of candidate crawl UUIDs, returns the subset that already
    /// exists in `crawl_summary`.
    ///
    /// Queries one uuid at a time through a reused prepared statement rather
    /// than building a variable-length `IN (...)` clause.
    pub fn test_has_crawl_summary_with_uuid_bulk(
        &self,
        crawl_uuids: &[Uuid],
    ) -> Result<HashSet<Uuid>, DatabaseError> {
        // trace! added for consistency with the other methods on this impl.
        trace!("summary_db.test_has_crawl_summary_with_uuid_bulk()");
        let mut statement = self.connection().prepare("
SELECT crawl_uuid
FROM crawl_summary
WHERE crawl_uuid = ?
")?;
        let mut found_uuids = HashSet::new();
        for uuid in crawl_uuids {
            for res_uuid in statement.query_map((uuid,), |row| row.get::<usize, Uuid>(0))? {
                found_uuids.insert(res_uuid?);
            }
        }
        Ok(found_uuids)
    }
}
impl SummaryDatabaseTransaction<'_> {
    /// Inserts a batch of crawl summaries inside this transaction.
    ///
    /// For each summary this:
    /// 1. inserts the `crawl_summary` row,
    /// 2. widens the owning entity generation's first/last-seen window so it
    ///    covers the crawl time,
    /// 3. inserts the optional `http_summary` child row, keyed by the rowid
    ///    of the `crawl_summary` insert.
    pub fn add_crawl_summary_bulk(
        &mut self,
        summaries: &[WithEntityGenerationUuid<CrawlSummary>],
    ) -> Result<(), DatabaseError> {
        trace!("summary_db_transaction.add_crawl_summary_bulk()");
        // All three statements are prepared (or fetched from the statement
        // cache) once and reused for every summary in the batch.
        let mut store_crawl_statement = self.connection().prepare_cached(
            "INSERT INTO crawl_summary (
entity_generation_id,
was_robotstxt_approved,
crawl_type,
time_started_unix_utc,
crawl_uuid,
agent_uuid,
exit_code,
time_last_modified_unix_utc,
request_duration_ms
) VALUES (
?,?,?,?, ?,?,?,?, ?
)"
        )?;
        let mut update_entity_generation_statement = self.connection().prepare_cached(
            "UPDATE entity_generation
SET first_seen_unix_utc = min(first_seen_unix_utc, ?),
last_seen_unix_utc = max(last_seen_unix_utc, ?)
WHERE entity_generation_id = ?
"
        )?;
        let mut add_http_summary_statement = self.connection().prepare_cached(
            "INSERT INTO http_summary (
crawl_summary_id,
status_code,
etag
) VALUES (
?,?,?
)"
        )?;
        for summary in summaries {
            let entity_generation_id =
                self.get_entity_generation_id(summary.entity_generation_uuid)?;
            let crawl_timestamp = summary.data.crawl_time.timestamp();
            store_crawl_statement.execute(
                (
                    entity_generation_id,
                    summary.data.was_robotstxt_approved,
                    summary.data.crawl_type.clone().to_number(),
                    crawl_timestamp,
                    summary.data.crawl_uuid,
                    summary.data.agent_uuid,
                    summary.data.exit_code.to_number(),
                    to_unix_timestamp_opt(summary.data.server_last_modified),
                    summary.data.request_duration_ms,
                )
            )?;
            // Capture the new crawl_summary rowid immediately after the
            // INSERT, before any other statement runs, so no intervening
            // INSERT can ever clobber last_insert_rowid(). (The UPDATE below
            // does not change it, but hoisting the read makes the invariant
            // explicit and future-proof.)
            let crawl_summary_id = self.connection().last_insert_rowid();
            update_entity_generation_statement.execute(
                (
                    crawl_timestamp,
                    crawl_timestamp,
                    entity_generation_id,
                )
            )?;
            // `http` is optional: non-HTTP crawls simply get no child row.
            if let Some(http) = &summary.data.http {
                add_http_summary_statement.execute(
                    (
                        crawl_summary_id,
                        http.status_code,
                        http.etag.clone(),
                    )
                )?;
            }
        }
        Ok(())
    }
}
/// Maps one joined crawl_summary/http_summary row into a `CrawlSummary`.
///
/// Column order is fixed: 0..=7 are the crawl_summary columns, 8 and 9 are
/// http_summary.status_code and http_summary.etag.
fn crawl_summary_row_to_crawl_summary(
    row: &rusqlite::Row,
) -> Result<CrawlSummary, Error> {
    // Column 8 (status_code) is read as Option: when it is NULL no HTTP
    // summary is attached and column 9 (etag) is never read.
    let http_summary = match row.get::<usize, Option<u16>>(8)? {
        Some(status_code) => Some(HttpSummary {
            status_code,
            etag: row.get(9)?,
        }),
        None => None,
    };
    Ok(CrawlSummary {
        crawl_time: from_unix_timestamp_or_epoch(row.get(0)?),
        was_robotstxt_approved: row.get(1)?,
        crawl_type: CrawlType::from_number(row.get(2)?),
        crawl_uuid: row.get(3)?,
        agent_uuid: row.get(4)?,
        exit_code: ExitCode::from_number(row.get(5)?),
        server_last_modified: from_unix_timestamp_opt(row.get(6)?),
        request_duration_ms: row.get(7)?,
        http: http_summary,
    })
}