use log::trace;
use criterium::CriteriumChain;
use criterium::rusqlite::AssembleRusqliteQuery;
use rusqlite::Error;
use crate::crawling::CrawlCandidate;
use crate::crawling::ExitCode;
use crate::criterium::CrawlCandidateCriterium;
use crate::database::crawler::structs::CrawlerDatabase;
use crate::database::crawler::structs::CrawlerDatabaseTransaction;
use crate::database::id::UrlId;
use crate::database::DatabaseError;
use crate::database::error::SmuggleDatabaseErrorExtension;
use crate::database::Page;
use crate::database::sqlite_helper::*;
use crate::url::UrlWithoutFragment;
impl CrawlerDatabase {
    /// Fetches the stored crawl state for a single URL.
    ///
    /// The URL is first resolved to its id with `read_url_id`; the matching
    /// `crawl_candidate` row is then converted into a [`CrawlCandidate`]
    /// that carries a clone of `url`.
    ///
    /// # Errors
    ///
    /// Returns a [`DatabaseError`] when the id lookup fails, when the
    /// statement cannot be prepared or run, when a column cannot be read,
    /// or when no row matches (rusqlite's `query_row` reports an empty
    /// result as an error).
    pub fn get_crawl_candidate(
        &mut self,
        url: &UrlWithoutFragment
    ) -> Result<CrawlCandidate, DatabaseError> {
        trace!("crawler_db.get_crawl_candidate()");
        let url_id = self.base().read_url_id(url)?;
        let mut statement = self.connection().prepare("
SELECT
crawl_candidate.last_crawl_time_unix_utc,
crawl_candidate.last_crawl_exit_code,
crawl_candidate.last_contentful_crawl_time_unix_utc,
crawl_candidate.last_contentful_http_etag
FROM crawl_candidate
WHERE crawl_candidate.url_id = ?
")?;
        statement.query_row(
            (url_id,),
            |row| {
                Ok(CrawlCandidate {
                    url: url.clone(),
                    last_crawl_time: from_unix_timestamp_opt(row.get(0)?),
                    last_crawl_exit_code: ExitCode::from_number_opt(row.get(1)?),
                    last_contentful_crawl_time: from_unix_timestamp_opt(row.get(2)?),
                    last_contentful_http_etag: row.get(3)?,
                })
            }
        ).map_err(Into::into)
    }

    /// Returns one page of crawl candidates that match `criterium_chain`.
    ///
    /// The chain is assembled into SQL joins, a `WHERE` clause and its bound
    /// values; the limit and offset of `page` are appended as the last two
    /// bound values. Rows are ordered by `url_id` so that consecutive pages
    /// are stable: SQLite leaves the row order undefined without an
    /// `ORDER BY`, which would let `LIMIT`/`OFFSET` pages overlap or skip
    /// rows.
    ///
    /// The URL of every returned row is resolved with one `get_url_by_id`
    /// call per row.
    ///
    /// # Errors
    ///
    /// Returns a [`DatabaseError`] when the statement cannot be prepared or
    /// run, when a column cannot be read, or when resolving a row's URL
    /// fails. The first failing row aborts the whole page.
    pub fn get_crawl_candidates(
        &self,
        page: &Page,
        criterium_chain: CriteriumChain<CrawlCandidateCriterium>
    ) -> Result<Vec<CrawlCandidate>, DatabaseError> {
        trace!("crawler_db.get_crawl_candidates()");
        let mut query = criterium_chain.assemble_rusqlite_query_for_db(&());
        // Render the joins once; they are needed for the statement anyway.
        let joins = query.joins_to_sql();
        trace!("SQL where: {}", query.sql_where_clause);
        trace!("SQL joins: {}", joins);
        let sql = format!("
SELECT
crawl_candidate.url_id,
crawl_candidate.last_crawl_time_unix_utc,
crawl_candidate.last_crawl_exit_code,
crawl_candidate.last_contentful_crawl_time_unix_utc,
crawl_candidate.last_contentful_http_etag
FROM crawl_candidate
{}
WHERE {}
ORDER BY crawl_candidate.url_id
LIMIT ?
OFFSET ?",
            joins,
            query.sql_where_clause
        );
        let mut statement = self.connection().prepare(sql.as_str())?;
        // LIMIT and OFFSET are the last two placeholders, so their values
        // go after everything the where clause binds.
        query.where_values.push(page.limit().into());
        query.where_values.push(page.offset().into());
        // Bind the result to a local instead of using a tail expression:
        // the row iterator borrows `statement` and has to be dropped
        // before `statement` goes out of scope.
        let candidates: Result<Vec<CrawlCandidate>, DatabaseError> = statement.query_map(
            query.where_values_as_params(),
            |row| {
                Ok(CrawlCandidate {
                    // The closure must return a rusqlite result, so the
                    // database error is carried through rusqlite's error
                    // type here.
                    url: self.base().get_url_by_id(row.get(0)?)
                        .smuggle_through_rusqlite()?,
                    last_crawl_time: from_unix_timestamp_opt(row.get(1)?),
                    last_crawl_exit_code: ExitCode::from_number_opt(row.get(2)?),
                    last_contentful_crawl_time: from_unix_timestamp_opt(row.get(3)?),
                    last_contentful_http_etag: row.get(4)?,
                })
            }
        )?.map(|r| r.map_err(Into::into)).collect();
        candidates
    }
}
impl CrawlerDatabaseTransaction<'_> {
pub fn store_crawler_found_link_by_id(
&mut self,
url_id: UrlId
) -> Result<(),Error> {
self.connection().execute("
INSERT OR IGNORE INTO crawl_candidate (
url_id
) VALUES (?)
",(url_id,))?;
Ok(())
}
pub fn store_crawler_found_links(
&mut self,
links: &[UrlWithoutFragment],
) -> Result<(), DatabaseError> {
self.base_transaction.assert_writable("store_crawler_found_links")?;
trace!("crawler_db_transaction.store_crawler_found_links()");
let mut store_statement = self.connection().prepare("
INSERT OR IGNORE INTO crawl_candidate (
url_id
) VALUES (?)
")?;
for url in links {
let url_id = self.base_transaction.get_url_id(url, true)?;
store_statement.execute((
url_id,
))?;
}
Ok(())
}
pub fn store_crawl_candidates(
&mut self,
candidates: &[CrawlCandidate]
) -> Result<(), DatabaseError> {
self.base_transaction.assert_writable("store_crawl_candidates")?;
trace!("crawler_db_transaction.store_crawl_candidates()");
let mut store_statement = self.connection().prepare("
INSERT OR REPLACE INTO crawl_candidate (
url_id,
last_crawl_time_unix_utc,
last_crawl_exit_code,
last_contentful_crawl_time_unix_utc,
last_contentful_http_etag
) VALUES (?,?,?,?,?)
")?;
for candidate in candidates {
let url_id = self.base_transaction.get_url_id(&candidate.url, true)?;
store_statement.execute((
url_id,
to_unix_timestamp_opt(candidate.last_crawl_time),
candidate.last_crawl_exit_code.map(|c| c.to_number()),
to_unix_timestamp_opt(candidate.last_contentful_crawl_time),
candidate.last_contentful_http_etag.clone(),
))?;
}
Ok(())
}
}