use std::{borrow::Cow, collections::HashMap, ops::AddAssign, path::PathBuf};
use crate::story::{Story, StoryEvaluator, StoryIdentifier, StoryTagger};
use progscrape_scrapers::{ScrapeCollection, StoryDate, StoryUrl, TypedScrape};
use serde::{Deserialize, Serialize};
use thiserror::Error;
mod backerupper;
mod db;
mod index;
mod memindex;
mod scrapestore;
mod shard;
pub use backerupper::{BackerUpper, BackupResult};
pub use index::StoryIndex;
pub use memindex::MemIndex;
pub use shard::{Shard, ShardOrder};
use self::shard::ShardRange;
#[derive(Error, Debug)]
pub enum PersistError {
#[error("SQLite error")]
SQLiteError(#[from] rusqlite::Error),
#[error("Tantivy error")]
TantivyError(#[from] tantivy::TantivyError),
#[error("Tantivy error")]
TantivyPathError(#[from] tantivy::directory::error::OpenDirectoryError),
#[error("Tantivy query parser error")]
TantivyQueryError(#[from] tantivy::query::QueryParserError),
#[error("JSON error")]
JsonError(#[from] serde_json::Error),
#[error("Serialize/deserialize error")]
SerdeError(#[from] serde_rusqlite::Error),
#[error("I/O error")]
IOError(#[from] std::io::Error),
#[error("Unexpected error")]
UnexpectedError(String),
}
/// A pair of counters: how many stories and how many scrapes were counted.
///
/// Used by [`StorageSummary`] both per shard and for the overall total.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct ShardSummary {
/// Number of stories counted.
pub story_count: usize,
/// Number of scrapes counted.
pub scrape_count: usize,
}
/// Story/scrape counts for a storage backend, broken down by shard.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct StorageSummary {
/// One entry per shard: a string label (presumably the shard's name — confirm
/// against the implementations) and that shard's counts.
pub by_shard: Vec<(String, ShardSummary)>,
/// Counts for the storage as a whole.
pub total: ShardSummary,
}
/// Result counts for a search, broken down by shard.
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct SearchSummary {
/// One entry per shard: a string label (presumably the shard's name — confirm
/// against the implementations) and the count for that shard.
pub by_shard: Vec<(String, usize)>,
/// Overall count.
pub total: usize,
}
/// The kinds of query that can be run against story storage.
///
/// User-entered search text is turned into one of these by
/// `StoryQuery::from_search`.
#[derive(Debug, Clone)]
pub enum StoryQuery {
/// Look up by story identifier.
ById(StoryIdentifier),
/// Select by shard.
ByShard(Shard),
/// The query produced for empty search text or text with no alphanumeric
/// characters.
FrontPage,
/// Search by tag. The first field is the tag; the optional second field is
/// an alternate spelling (the lowercased search text, when it differs from
/// the tag the tagger resolved it to).
TagSearch(String, Option<String>),
/// Search by host name.
DomainSearch(String),
/// Search by URL.
UrlSearch(StoryUrl),
/// Free-text search.
TextSearch(String),
/// Search carrying a title and a list of tags (rendered as `title:… tags:…`
/// by `query_text`).
RelatedSearch(String, Vec<String>),
}
/// Conversion from a search input (a string-like value, or an optional one)
/// into a [`StoryQuery`].
pub trait IntoStoryQuery {
/// Returns the search text carried by this value as a string slice.
fn search_text(&self) -> &str;
/// Consumes this value and turns it into a [`StoryQuery`], using `tagger`
/// to help classify the text.
fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery;
}
/// Private marker trait for the string-like types that receive the blanket
/// [`IntoStoryQuery`] implementations in this module. Keeping the set closed
/// avoids a blanket impl over every `AsRef<str>` type.
trait StoryQueryString: AsRef<str> {}
impl StoryQueryString for &str {}
impl StoryQueryString for String {}
impl StoryQueryString for &String {}
impl StoryQueryString for Cow<'_, str> {}
/// Any supported string-like type can be used directly as a search input.
impl<S: StoryQueryString> IntoStoryQuery for S {
    /// The underlying text with surrounding whitespace removed.
    fn search_text(&self) -> &str {
        let raw: &str = self.as_ref();
        raw.trim()
    }

    /// Classifies the text via [`StoryQuery::from_search`].
    fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
        let text: &str = self.as_ref();
        StoryQuery::from_search(tagger, text)
    }
}
/// A borrowed optional search string: `None` means "no search".
impl<S: StoryQueryString> IntoStoryQuery for &Option<S> {
    /// `None` becomes [`StoryQuery::FrontPage`]; `Some` text is classified the
    /// same way as a plain string slice.
    fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
        match self {
            Some(s) => s.as_ref().into_story_query(tagger),
            None => StoryQuery::FrontPage,
        }
    }

    /// The trimmed search text, or the empty string when there is none.
    fn search_text(&self) -> &str {
        match self {
            Some(s) => s.as_ref().trim(),
            None => "",
        }
    }
}
/// An owned optional search string: `None` means "no search".
impl<S: StoryQueryString> IntoStoryQuery for Option<S> {
    /// Delegates to the implementation for `&Option<S>`.
    fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
        <&Option<S> as IntoStoryQuery>::into_story_query(&self, tagger)
    }

    /// The trimmed search text, or the empty string when there is none.
    fn search_text(&self) -> &str {
        match self {
            Some(s) => s.as_ref().trim(),
            None => "",
        }
    }
}
impl StoryQuery {
    /// Returns the textual form of this query.
    ///
    /// Variants that simply carry a string borrow it; the remaining variants
    /// are formatted into a freshly allocated string. `FrontPage` yields the
    /// empty string.
    pub fn query_text(&self) -> Cow<'_, str> {
        match self {
            Self::FrontPage => Cow::from(""),
            Self::DomainSearch(domain) => Cow::from(domain),
            Self::TagSearch(tag, _) => Cow::from(tag),
            Self::TextSearch(text) => Cow::from(text),
            Self::UrlSearch(url) => Cow::from(url.to_string()),
            Self::ById(id) => Cow::from(format!("id={id}")),
            Self::ByShard(shard) => Cow::from(format!("shard={shard:?}")),
            Self::RelatedSearch(title, tags) => {
                Cow::from(format!("title:{title:?} tags:{tags:?}"))
            }
        }
    }

    /// Returns a short static label naming the kind of query. `FrontPage`
    /// yields the empty string.
    pub fn query_type(&self) -> &'static str {
        match self {
            Self::FrontPage => "",
            Self::ById(..) => "id",
            Self::ByShard(..) => "shard",
            Self::TagSearch(..) => "tag",
            Self::DomainSearch(..) => "domain",
            Self::UrlSearch(..) => "url",
            Self::TextSearch(..) => "text",
            Self::RelatedSearch(..) => "related",
        }
    }

    /// Classifies free-form search text into a query.
    ///
    /// The text is trimmed first, then the following rules are tried in order:
    ///
    /// 1. No alphanumeric character at all (including empty text): `FrontPage`.
    /// 2. The tagger recognizes the text as a tag: `TagSearch` with that tag.
    ///    When the tag differs from the text (ignoring ASCII case), the
    ///    ASCII-lowercased text is kept as the alternate.
    /// 3. The text looks like a domain or URL: `DomainSearch` / `UrlSearch`.
    /// 4. The text is purely alphanumeric: `TagSearch` on the lowercased text.
    /// 5. Anything else: `TextSearch`.
    pub fn from_search(tagger: &StoryTagger, search: &str) -> Self {
        let search = search.trim();

        // Covers the empty string too: it contains no alphanumeric character.
        if !search.chars().any(char::is_alphanumeric) {
            return Self::FrontPage;
        }

        if let Some(tag) = tagger.check_tag_search(search) {
            let alt = (!tag.eq_ignore_ascii_case(search)).then(|| search.to_ascii_lowercase());
            return Self::TagSearch(tag.to_string(), alt);
        }

        if let Some(query) = Self::try_domain_or_url(search) {
            return query;
        }

        if search.chars().all(char::is_alphanumeric) {
            Self::TagSearch(search.to_lowercase(), None)
        } else {
            Self::TextSearch(search.to_string())
        }
    }

    /// Tries to interpret `search` as a URL or a bare domain.
    ///
    /// Only text containing `.` or `:` is considered. Text with a `:` is parsed
    /// as-is; otherwise `http://` is prepended before parsing. The parsed host
    /// must consist solely of alphanumerics, `.` and `-`, and contain at least
    /// one alphanumeric or `-`. A `/` in the original text selects `UrlSearch`;
    /// otherwise the result is a `DomainSearch` on the host.
    fn try_domain_or_url(search: &str) -> Option<StoryQuery> {
        let has_colon = search.contains(':');
        if !has_colon && !search.contains('.') {
            return None;
        }

        let parsed = if has_colon {
            StoryUrl::parse(search)
        } else {
            StoryUrl::parse(format!("http://{search}"))
        };
        let url = parsed?;

        let host = url.host();
        let has_forbidden_char =
            host.contains(|c: char| !(c.is_alphanumeric() || c == '.' || c == '-'));
        let has_label_char = host.contains(|c: char| c.is_alphanumeric() || c == '-');
        if has_forbidden_char || !has_label_char {
            return None;
        }

        if search.contains('/') {
            Some(StoryQuery::UrlSearch(url))
        } else {
            Some(StoryQuery::DomainSearch(url.host().to_owned()))
        }
    }
}
/// Marker trait for the payload types `S` that may appear in `Story<S>` when
/// fetching from storage. Implemented here for `()`, [`Shard`] and
/// [`TypedScrape`].
pub trait StoryScrapePayload: Send + Sync {}
impl StoryScrapePayload for () {}
impl StoryScrapePayload for Shard {}
impl StoryScrapePayload for TypedScrape {}
/// Fetch support for one specific payload type `S`.
///
/// A storage backend implements this once per payload type it can return;
/// `Storage::fetch` and `Storage::fetch_one` dispatch to it.
pub trait StorageFetch<S: StoryScrapePayload> {
/// Runs `query` and returns the matching stories with payload `S`. `max` is
/// passed through from the caller (presumably an upper bound on the number
/// of results — confirm against the implementations).
fn fetch_type(&self, query: &StoryQuery, max: usize) -> Result<Vec<Story<S>>, PersistError>;
}
/// Read-only access to a story store.
pub trait Storage: Send + Sync {
    /// Returns a [`StoryDate`]; judging by the name, the date of the most
    /// recent story in this storage.
    fn most_recent_story(&self) -> Result<StoryDate, PersistError>;

    /// Returns the range of shards for this storage.
    fn shard_range(&self) -> Result<ShardRange, PersistError>;

    /// Returns per-shard and total story/scrape counts.
    fn story_count(&self) -> Result<StorageSummary, PersistError>;

    /// Returns a count for `query`; `max` is supplied by the caller
    /// (presumably a cap on the count — confirm against the implementations).
    fn fetch_count(&self, query: &StoryQuery, max: usize) -> Result<usize, PersistError>;

    /// Returns counts for `query`, broken down by shard.
    fn fetch_count_by_shard(&self, query: &StoryQuery) -> Result<SearchSummary, PersistError>;

    /// Returns an optional map from string keys to lists of string values for
    /// `query`, or `None`.
    fn fetch_detail_one(
        &self,
        query: &StoryQuery,
    ) -> Result<Option<HashMap<String, Vec<String>>>, PersistError>;

    /// Fetches stories for `query` with payload type `S`, forwarding `max` to
    /// the matching [`StorageFetch`] implementation.
    #[inline(always)]
    fn fetch<S: StoryScrapePayload>(
        &self,
        query: &StoryQuery,
        max: usize,
    ) -> Result<Vec<Story<S>>, PersistError>
    where
        Self: StorageFetch<S>,
    {
        <Self as StorageFetch<S>>::fetch_type(self, query, max)
    }

    /// Fetches the first story for `query`, if any, by requesting `max = 1`
    /// from the matching [`StorageFetch`] implementation.
    #[inline(always)]
    fn fetch_one<S: StoryScrapePayload>(
        &self,
        query: &StoryQuery,
    ) -> Result<Option<Story<S>>, PersistError>
    where
        Self: StorageFetch<S>,
    {
        let stories = <Self as StorageFetch<S>>::fetch_type(self, query, 1)?;
        Ok(stories.into_iter().next())
    }
}
/// Write access to a story store, on top of the read operations of [`Storage`].
///
/// Each method takes a [`StoryEvaluator`] and returns a list of
/// [`ScrapePersistResult`] values (presumably one per input item — confirm
/// against the implementations).
pub trait StorageWriter: Storage {
/// Inserts individual scrapes.
fn insert_scrapes<I: IntoIterator<Item = TypedScrape>>(
&mut self,
eval: &StoryEvaluator,
scrapes: I,
) -> Result<Vec<ScrapePersistResult>, PersistError>;
/// Inserts pre-grouped scrape collections.
fn insert_scrape_collections<I: IntoIterator<Item = ScrapeCollection>>(
&mut self,
eval: &StoryEvaluator,
stories: I,
) -> Result<Vec<ScrapePersistResult>, PersistError>;
/// Re-inserts the stories named by the given identifiers.
fn reinsert_stories<I: IntoIterator<Item = StoryIdentifier>>(
&mut self,
eval: &StoryEvaluator,
stories: I,
) -> Result<Vec<ScrapePersistResult>, PersistError>;
}
/// Outcome of persisting a single item through a `StorageWriter`.
#[derive(Debug, Serialize, PartialEq, Eq, Ord, PartialOrd, Hash)]
pub enum ScrapePersistResult {
/// The item was merged into a story that already existed.
MergedWithExistingStory,
/// The item was already part of an existing story.
AlreadyPartOfExistingStory,
/// The item resulted in a new story.
NewStory,
/// The item could not be found.
NotFound,
}
/// Tally of [`ScrapePersistResult`] values, one counter per variant.
#[derive(Default, Debug, Serialize)]
pub struct ScrapePersistResultSummary {
/// Count of `MergedWithExistingStory` results.
pub merged: usize,
/// Count of `AlreadyPartOfExistingStory` results.
pub existing: usize,
/// Count of `NewStory` results.
pub new: usize,
/// Count of `NotFound` results.
pub not_found: usize,
}
/// Allows summaries to be accumulated with `+=`, adding counter by counter.
impl AddAssign for ScrapePersistResultSummary {
    fn add_assign(&mut self, rhs: Self) {
        // Destructure exhaustively so a newly added counter cannot be
        // forgotten here without a compile error.
        let Self {
            merged,
            existing,
            new,
            not_found,
        } = rhs;
        self.merged += merged;
        self.existing += existing;
        self.new += new;
        self.not_found += not_found;
    }
}
/// Something that can be condensed into a [`ScrapePersistResultSummary`].
pub trait ScrapePersistResultSummarizer {
/// Builds the summary for this value.
fn summary(&self) -> ScrapePersistResultSummary;
}
/// Summarizes a list of results by counting how often each variant occurs.
impl ScrapePersistResultSummarizer for Vec<ScrapePersistResult> {
    fn summary(&self) -> ScrapePersistResultSummary {
        self.iter()
            .fold(ScrapePersistResultSummary::default(), |mut tally, result| {
                // Pick the counter that corresponds to this variant, then bump it.
                let counter = match result {
                    ScrapePersistResult::MergedWithExistingStory => &mut tally.merged,
                    ScrapePersistResult::AlreadyPartOfExistingStory => &mut tally.existing,
                    ScrapePersistResult::NewStory => &mut tally.new,
                    ScrapePersistResult::NotFound => &mut tally.not_found,
                };
                *counter += 1;
                tally
            })
    }
}
/// Where a store keeps its data: nowhere on disk, or under a filesystem path.
#[derive(Clone, Debug)]
pub enum PersistLocation {
    /// No filesystem location.
    Memory,
    /// The given filesystem path.
    Path(PathBuf),
}

impl PersistLocation {
    /// Derives a child location.
    ///
    /// For `Path`, `p` is joined onto the stored path. `Memory` has no path to
    /// extend, so it stays `Memory` and `p` is ignored.
    pub fn join<P: AsRef<std::path::Path>>(&self, p: P) -> PersistLocation {
        if let Self::Path(base) = self {
            Self::Path(base.join(p))
        } else {
            Self::Memory
        }
    }
}