progscrape_application/persist/mod.rs

use std::{borrow::Cow, collections::HashMap, ops::AddAssign, path::PathBuf};

use crate::story::{Story, StoryEvaluator, StoryIdentifier, StoryTagger};
use progscrape_scrapers::{ScrapeCollection, StoryDate, StoryUrl, TypedScrape};
use serde::{Deserialize, Serialize};
use thiserror::Error;

mod backerupper;
mod db;
mod index;
mod memindex;
mod scrapestore;
mod shard;

pub use backerupper::{BackerUpper, BackupResult};
pub use index::StoryIndex;
pub use memindex::MemIndex;
pub use shard::Shard;

use self::shard::ShardRange;

#[derive(Error, Debug)]
pub enum PersistError {
    #[error("SQLite error")]
    SQLiteError(#[from] rusqlite::Error),
    #[error("Tantivy error")]
    TantivyError(#[from] tantivy::TantivyError),
    #[error("Tantivy directory error")]
    TantivyPathError(#[from] tantivy::directory::error::OpenDirectoryError),
30    #[error("Tantivy query parser error")]
31    TantivyQueryError(#[from] tantivy::query::QueryParserError),
32    #[error("JSON error")]
33    JsonError(#[from] serde_json::Error),
34    #[error("Serialize/deserialize error")]
35    SerdeError(#[from] serde_rusqlite::Error),
36    #[error("I/O error")]
37    IOError(#[from] std::io::Error),
38    #[error("Unexpected error")]
39    UnexpectedError(String),
40}
41
42#[derive(Debug, Default, Serialize, Deserialize)]
43pub struct ShardSummary {
44    pub story_count: usize,
45    pub scrape_count: usize,
46}
47
48#[derive(Debug, Default, Serialize, Deserialize)]
49pub struct StorageSummary {
50    pub by_shard: Vec<(String, ShardSummary)>,
51    pub total: ShardSummary,
52}

#[derive(Debug)]
/// The type of story fetch to perform.
pub enum StoryQuery {
    /// A single story.
    ById(StoryIdentifier),
    /// All stories from a given shard.
    ByShard(Shard),
    /// Front page stories.
    FrontPage,
    /// Stories matching a tag query (the second item is an optional alternative form of the tag, taken from the raw search text).
    TagSearch(String, Option<String>),
    /// Stories matching a domain query.
    DomainSearch(String),
    /// Stories matching a specific URL.
    UrlSearch(StoryUrl),
    /// Stories matching a text search.
    TextSearch(String),
    /// Related stories (title, tags).
    Related(String, Vec<String>),
}

/// A string that may be turned into a [`StoryQuery`].
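///
/// A rough sketch of the intended conversions (the `tagger` is an assumed,
/// already-configured [`StoryTagger`], and `"rust"` is assumed to be one of
/// its known tags):
///
/// ```ignore
/// let q = "rust".into_story_query(&tagger);          // StoryQuery::TagSearch
/// let q = "example.com".into_story_query(&tagger);   // StoryQuery::DomainSearch
/// let q = None::<String>.into_story_query(&tagger);  // StoryQuery::FrontPage
/// ```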
pub trait IntoStoryQuery {
    fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery;
}

trait StoryQueryString: AsRef<str> {}

impl<'a> StoryQueryString for &'a str {}
impl StoryQueryString for String {}
impl<'a> StoryQueryString for &'a String {}
impl<'a> StoryQueryString for Cow<'a, str> {}

impl<S: StoryQueryString> IntoStoryQuery for S {
    fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
        StoryQuery::from_search(tagger, self.as_ref())
    }
}

impl<S: StoryQueryString> IntoStoryQuery for &Option<S> {
    fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
        let Some(s) = self else {
            return StoryQuery::FrontPage;
        };
        s.as_ref().into_story_query(tagger)
    }
}

impl<S: StoryQueryString> IntoStoryQuery for Option<S> {
    fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
        (&self).into_story_query(tagger)
    }
}

impl StoryQuery {
    /// Reconstructs the query text for the given query.
    pub fn query_text(&self) -> Cow<str> {
        match self {
            Self::FrontPage => "".into(),
            Self::ById(id) => format!("id={id}").into(),
            Self::ByShard(shard) => format!("shard={shard:?}").into(),
            Self::DomainSearch(domain) => domain.into(),
            Self::UrlSearch(url) => url.to_string().into(),
            Self::TagSearch(tag, _) => tag.into(),
            Self::TextSearch(text) => text.into(),
            // TODO: This probably won't work
            Self::Related(title, tags) => format!("title:{title:?} tags:{tags:?}").into(),
        }
    }

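    /// Heuristically classifies a raw search string into a [`StoryQuery`].
    ///
    /// A rough sketch of the expected mapping (assuming `"rust"` is a tag known
    /// to the `tagger`; exact results depend on the tagger configuration):
    ///
    /// ```ignore
    /// StoryQuery::from_search(&tagger, "rust");          // TagSearch("rust", None)
    /// StoryQuery::from_search(&tagger, "example.com");   // DomainSearch("example.com")
    /// StoryQuery::from_search(&tagger, "example.com/a"); // UrlSearch(...)
    /// StoryQuery::from_search(&tagger, "memory safety"); // TextSearch("memory safety")
    /// StoryQuery::from_search(&tagger, "   ");           // FrontPage
    /// ```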
    pub fn from_search(tagger: &StoryTagger, search: &str) -> Self {
        // Always trim whitespace
        let search = search.trim();

        // An empty search or a search containing no alphanumeric chars is shunted to the frontpage
        if search.is_empty() || !search.contains(|c: char| c.is_alphanumeric()) {
            return Self::FrontPage;
        }

        // This isn't terribly smart, buuuuut it allows us to search either a tag or site
        if let Some(tag) = tagger.check_tag_search(search) {
            let alt = if tag.eq_ignore_ascii_case(search) {
                None
            } else {
                Some(search.to_ascii_lowercase())
            };
            StoryQuery::TagSearch(tag.to_string(), alt)
        } else if let Some(domain_or_url) = Self::try_domain_or_url(search) {
            domain_or_url
        } else {
            StoryQuery::TextSearch(search.to_string())
        }
    }

    fn try_domain_or_url(search: &str) -> Option<StoryQuery> {
        // Only test a domain search if the search contains a domain-like char
        if search.contains('.') || search.contains(':') {
            let url = if search.contains(':') {
                StoryUrl::parse(search)
            } else {
                // TODO: We probably don't want to re-parse this as a URL, but it's the fastest way to normalize it
                StoryUrl::parse(format!("http://{}", search))
            };
            if let Some(url) = url {
                let host = url.host();
                if host.contains(|c: char| !c.is_alphanumeric() && c != '.' && c != '-')
                    || !host.contains(|c: char| c.is_alphanumeric() || c == '-')
                {
                    None
                } else if search.contains('/') {
                    Some(StoryQuery::UrlSearch(url))
                } else {
                    Some(StoryQuery::DomainSearch(url.host().to_owned()))
                }
            } else {
                None
            }
        } else {
            None
        }
    }
}

pub trait StoryScrapePayload: Send + Sync {}

impl StoryScrapePayload for () {}
impl StoryScrapePayload for Shard {}
impl StoryScrapePayload for TypedScrape {}

pub trait StorageFetch<S: StoryScrapePayload> {
    fn fetch_type(&self, query: StoryQuery, max: usize) -> Result<Vec<Story<S>>, PersistError>;
}

/// The underlying storage engine.
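///
/// The payload type chosen for a fetch controls how much data comes back with
/// each story. A rough usage sketch (assuming `index` is a concrete store with
/// the relevant [`StorageFetch`] impls; `index` and `id` are illustrative names):
///
/// ```ignore
/// // Shard-only payloads are cheap and suit listing pages.
/// let stories: Vec<Story<Shard>> = index.fetch(StoryQuery::FrontPage, 30)?;
/// // A full scrape payload for a single story.
/// let story: Option<Story<TypedScrape>> = index.fetch_one(StoryQuery::ById(id))?;
/// ```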
pub trait Storage: Send + Sync {
    /// Returns the most recent story date.
    fn most_recent_story(&self) -> Result<StoryDate, PersistError>;

    /// Returns the range of shards for this index.
    fn shard_range(&self) -> Result<ShardRange, PersistError>;

    /// Count the docs in this index, breaking it out by index segment.
    fn story_count(&self) -> Result<StorageSummary, PersistError>;

    /// Count the docs matching the query, at most max.
    fn fetch_count(&self, query: StoryQuery, max: usize) -> Result<usize, PersistError>;

    /// Fetches the index-specific story details for a single story.
    fn fetch_detail_one(
        &self,
        query: StoryQuery,
    ) -> Result<Option<HashMap<String, Vec<String>>>, PersistError>;

    /// Fetch a list of stories with the specified payload type.
    #[inline(always)]
    fn fetch<S: StoryScrapePayload>(
        &self,
        query: StoryQuery,
        max: usize,
    ) -> Result<Vec<Story<S>>, PersistError>
    where
        Self: StorageFetch<S>,
    {
        <Self as StorageFetch<S>>::fetch_type(self, query, max)
    }

    /// Fetch a single story with the specified payload type.
    #[inline(always)]
    fn fetch_one<S: StoryScrapePayload>(
        &self,
        query: StoryQuery,
    ) -> Result<Option<Story<S>>, PersistError>
    where
        Self: StorageFetch<S>,
    {
        Ok(<Self as StorageFetch<S>>::fetch_type(self, query, 1)?
            .into_iter()
            .next())
    }
}

pub trait StorageWriter: Storage {
    /// Insert a set of scrapes, merging with existing stories if necessary.
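    ///
    /// A rough sketch of the intended call pattern (`index`, `eval`, and
    /// `scrapes` are placeholders for values obtained elsewhere):
    ///
    /// ```ignore
    /// let results = index.insert_scrapes(&eval, scrapes)?;
    /// println!("{:?}", results.summary());
    /// ```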
    fn insert_scrapes<I: IntoIterator<Item = TypedScrape>>(
        &mut self,
        eval: &StoryEvaluator,
        scrapes: I,
    ) -> Result<Vec<ScrapePersistResult>, PersistError>;

    /// Insert a set of pre-digested stories. Assumes that the underlying story does not exist and no merging is required.
    fn insert_scrape_collections<I: IntoIterator<Item = ScrapeCollection>>(
        &mut self,
        eval: &StoryEvaluator,
        stories: I,
    ) -> Result<Vec<ScrapePersistResult>, PersistError>;

    /// Given a set of existing stories, re-inserts them into the index with updated scores and tags.
    fn reinsert_stories<I: IntoIterator<Item = StoryIdentifier>>(
        &mut self,
        eval: &StoryEvaluator,
        stories: I,
    ) -> Result<Vec<ScrapePersistResult>, PersistError>;
}

#[derive(Debug, Serialize, PartialEq, Eq, Ord, PartialOrd, Hash)]
pub enum ScrapePersistResult {
    /// The story was merged with an existing story whilst we tried to re-insert it.
    MergedWithExistingStory,
    /// The scrape has already been added.
    AlreadyPartOfExistingStory,
    /// This is a new story.
    NewStory,
    /// The story was not found whilst we tried to re-insert it.
    NotFound,
}

#[derive(Default, Debug, Serialize)]
pub struct ScrapePersistResultSummary {
    pub merged: usize,
    pub existing: usize,
    pub new: usize,
    pub not_found: usize,
}

impl AddAssign for ScrapePersistResultSummary {
    fn add_assign(&mut self, rhs: Self) {
        self.merged += rhs.merged;
        self.existing += rhs.existing;
        self.new += rhs.new;
        self.not_found += rhs.not_found;
    }
}

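/// Rolls a list of per-scrape results up into counters.
///
/// A minimal sketch, assuming `results` is the `Vec<ScrapePersistResult>`
/// returned by one of the [`StorageWriter`] methods:
///
/// ```ignore
/// let summary = results.summary();
/// assert_eq!(summary.new + summary.merged + summary.existing + summary.not_found, results.len());
/// ```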
pub trait ScrapePersistResultSummarizer {
    fn summary(&self) -> ScrapePersistResultSummary;
}

impl ScrapePersistResultSummarizer for Vec<ScrapePersistResult> {
    fn summary(&self) -> ScrapePersistResultSummary {
        let mut summary = ScrapePersistResultSummary::default();
        for x in self {
            match x {
                ScrapePersistResult::MergedWithExistingStory => summary.merged += 1,
                ScrapePersistResult::AlreadyPartOfExistingStory => summary.existing += 1,
                ScrapePersistResult::NewStory => summary.new += 1,
                ScrapePersistResult::NotFound => summary.not_found += 1,
            }
        }
        summary
    }
}

#[derive(Clone, Debug)]
/// Where is this persistence engine storing data?
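///
/// A small sketch of how locations compose: joining a path onto an on-disk
/// location extends the path, while the in-memory location stays in-memory.
///
/// ```ignore
/// let root = PersistLocation::Path("data".into());
/// let index = root.join("index"); // PersistLocation::Path("data/index")
/// let mem = PersistLocation::Memory.join("index"); // still PersistLocation::Memory
/// ```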
pub enum PersistLocation {
    /// In-memory.
    Memory,
    /// At a given path.
    Path(PathBuf),
}

impl PersistLocation {
    pub fn join<P: AsRef<std::path::Path>>(&self, p: P) -> PersistLocation {
        match self {
            PersistLocation::Memory => PersistLocation::Memory,
            PersistLocation::Path(path) => PersistLocation::Path(path.join(p)),
        }
    }
}