use log::trace;
use crate::crawling::File;
use crate::database::crawler::structs::CrawlerDatabase;
use crate::database::crawler::structs::CrawlerDatabaseTransaction;
use crate::database::id::FileInfoId;
use crate::database::id::NumericDatabseId;
use crate::database::DatabaseError;
use crate::database::error::SmuggleDatabaseErrorExtension;
use crate::database::id::CrawlLogEntryId;
use crate::database::Page;
use crate::database::sqlite_helper::*;
use crate::database::WithNumericId;
use crate::url::UrlWithoutFragment;
impl CrawlerDatabase {
    /// Loads the `file` row identified by `id` and converts it into a [`File`].
    ///
    /// # Errors
    /// Returns a [`DatabaseError`] when the query itself fails (rusqlite's
    /// `query_row` also reports "no row matched" as an error) or when
    /// `file_row_to_struct` fails while converting the row.
    pub fn get_file_info(&self, id: FileInfoId) -> Result<File, DatabaseError> {
        trace!("crawler_db.get_file_info()");
        // The row closure has to return a rusqlite result, so the conversion
        // result is handed out as the *value*; the `?` below only unwraps the
        // outer query result and leaves the inner `Result<File, DatabaseError>`
        // as this function's return value.
        self.connection().query_row(
            "SELECT
crawl_log_id,
request_id,
url_id,
last_modified_unix_utc,
file_size,
mimetype_id,
canonical_url_id,
file_id
FROM file
WHERE file_id = ?
", (id,),
            |row| Ok(file_row_to_struct(self, row)),
        )?
    }

    /// Loads the most recent `file` row recorded for `url`, together with its id.
    ///
    /// "Most recent" is decided by the `time_started_unix_utc` of the joined
    /// `crawl_log` row (descending, first row wins).
    ///
    /// # Errors
    /// Returns a [`DatabaseError`] when the url id cannot be read, when the
    /// query fails (rusqlite's `query_row` also reports "no row matched" as an
    /// error), or when converting the row fails.
    pub fn get_latest_file_info_by_url(
        &self,
        url: &UrlWithoutFragment
    ) -> Result<WithNumericId<File, FileInfoId>, DatabaseError> {
        trace!("crawler_db.get_latest_file_info_by_url()");
        let url_id = self.base().read_url_id(url)?;
        // NOTE(review): there is no secondary sort key, so which row wins when
        // several share the same `time_started_unix_utc` is up to SQLite.
        self.connection().query_row(
            "SELECT
file.crawl_log_id,
file.request_id,
file.url_id,
file.last_modified_unix_utc,
file.file_size,
file.mimetype_id,
file.canonical_url_id,
file.file_id
FROM file
INNER JOIN crawl_log ON crawl_log.crawl_log_id = file.crawl_log_id
WHERE file.url_id = ?
ORDER BY crawl_log.time_started_unix_utc DESC
LIMIT 1
", (url_id,),
            |row| {
                // Conversion errors travel as the closure's value (inner
                // `Err`); only a failure to read the id column is reported
                // as a rusqlite error.
                match file_row_to_struct(self, row) {
                    Err(e) => Ok(Err(e)),
                    Ok(file) => Ok(Ok(WithNumericId {
                        // Column 7 is `file_id` in the SELECT above.
                        id: row.get(7)?,
                        data: file,
                    })),
                }
            },
        )?
    }

    /// Loads one page of `file` rows, each paired with its id.
    ///
    /// `page.limit()` and `page.offset()` are bound to the query's `LIMIT`
    /// and `OFFSET`. Rows are ordered by `file_id` so that consecutive pages
    /// neither overlap nor skip rows.
    ///
    /// # Errors
    /// Returns a [`DatabaseError`] when preparing or running the query fails
    /// or when any row cannot be converted.
    pub fn get_file_infos(
        &self,
        page: &Page
    ) -> Result<Vec<WithNumericId<File, FileInfoId>>, DatabaseError> {
        trace!("crawler_db.get_file_infos()");
        // Without an ORDER BY, SQLite leaves the row order undefined, which
        // makes LIMIT/OFFSET paging unreliable; `file_id` gives a stable order.
        let mut selector = self.connection().prepare(
            "SELECT
crawl_log_id,
request_id,
url_id,
last_modified_unix_utc,
file_size,
mimetype_id,
canonical_url_id,
file_id
FROM file
ORDER BY file_id
LIMIT ?
OFFSET ?
")?;
        let rows = selector.query_map(
            (page.limit(), page.offset()),
            |row| {
                Ok(WithNumericId {
                    // Column 7 is `file_id` in the SELECT above.
                    id: row.get(7)?,
                    // The row closure must yield a rusqlite result, so a
                    // conversion error is passed through rusqlite's error type
                    // here (presumably recovered by the `Into` conversion below).
                    data: file_row_to_struct(self, row).smuggle_through_rusqlite()?,
                })
            },
        )?;
        // Collecting into a `Result` stops at the first failing row.
        rows.map(|r| r.map_err(Into::into)).collect()
    }
}
impl CrawlerDatabaseTransaction<'_> {
    /// Inserts a new row into the `file` table for `file` and returns the id
    /// of the inserted row.
    ///
    /// The url, the optional canonical url and the mimetype are first resolved
    /// to ids through the base transaction (the `true` flag presumably asks for
    /// missing entries to be created — confirm against `get_url_id` /
    /// `get_mimetype_id`). A canonical url, when present, is additionally
    /// passed to `store_crawler_found_link_by_id`.
    ///
    /// # Errors
    /// Returns a [`DatabaseError`] when the transaction is not writable, when
    /// any of the id lookups fails, or when the insert fails.
    pub fn store_file_info(
        &mut self,
        file: &File
    ) -> Result<FileInfoId, DatabaseError> {
        self.base_transaction.assert_writable("store_file_info")?;
        trace!("crawler_db_transaction.store_file_info()");

        let url_id = self.base_transaction.get_url_id(&file.url, true)?;
        let canonical_url_id = match &file.canonical_url {
            Some(canonical_url) => {
                let resolved_id = self.base_transaction.get_url_id(canonical_url, true)?;
                self.store_crawler_found_link_by_id(resolved_id)?;
                Some(resolved_id)
            }
            None => None,
        };
        let mimetype_id = self.base_transaction.get_mimetype_id(&file.mime, true)?;

        // Parameter order must match the column list of the INSERT statement.
        self.connection().execute(
            "INSERT INTO file (
crawl_log_id,
request_id,
url_id,
last_modified_unix_utc,
file_size,
mimetype_id,
canonical_url_id
) Values (?,?,?,?, ?,?,?)
", (
                file.crawl_log_entry,
                file.request_id,
                url_id,
                to_unix_timestamp_opt(file.last_modified),
                file.size,
                mimetype_id,
                canonical_url_id,
            ),
        )?;

        // The rowid of the row inserted above becomes the file info id.
        Ok(FileInfoId::new(self.connection().last_insert_rowid()))
    }
}
/// Converts one result row of a `file` query into a [`File`].
///
/// The row is read by position; the queries in this file select the columns
/// in this order:
/// `0` crawl_log_id, `1` request_id, `2` url_id, `3` last_modified_unix_utc,
/// `4` file_size, `5` mimetype_id, `6` canonical_url_id (nullable).
/// Mimetype and url ids are resolved to their values through `db.base()`.
///
/// # Errors
/// Returns a [`DatabaseError`] when a column cannot be read or when one of
/// the mimetype/url lookups fails.
fn file_row_to_struct(
    db: &CrawlerDatabase,
    row: &rusqlite::Row
) -> Result<File, DatabaseError> {
    let mime = db.base().get_mimetype_by_id(row.get(5)?)?;
    // The canonical url is optional: only look it up when the column holds an id.
    let canonical_url = match row.get(6)? {
        Some(canonical_url_id) => Some(db.base().get_url_by_id(canonical_url_id)?),
        None => None,
    };
    Ok(File {
        crawl_log_entry: CrawlLogEntryId::new(row.get(0)?),
        request_id: row.get(1)?,
        url: db.base().get_url_by_id(row.get(2)?)?,
        last_modified: from_unix_timestamp_opt(row.get(3)?),
        size: row.get(4)?,
        mime,
        canonical_url,
    })
}