progscrape_application/persist/
mod.rs1use std::{borrow::Cow, collections::HashMap, ops::AddAssign, path::PathBuf};
2
3use crate::story::{Story, StoryEvaluator, StoryIdentifier, StoryTagger};
4use progscrape_scrapers::{ScrapeCollection, StoryDate, StoryUrl, TypedScrape};
5use serde::{Deserialize, Serialize};
6use thiserror::Error;
7
8mod backerupper;
9mod db;
10mod index;
11mod memindex;
12mod scrapestore;
13mod shard;
14
15pub use backerupper::{BackerUpper, BackupResult};
16pub use index::StoryIndex;
17pub use memindex::MemIndex;
18pub use shard::Shard;
19
20use self::shard::ShardRange;
21
22#[derive(Error, Debug)]
23pub enum PersistError {
24 #[error("SQLite error")]
25 SQLiteError(#[from] rusqlite::Error),
26 #[error("Tantivy error")]
27 TantivyError(#[from] tantivy::TantivyError),
28 #[error("Tantivy error")]
29 TantivyPathError(#[from] tantivy::directory::error::OpenDirectoryError),
30 #[error("Tantivy query parser error")]
31 TantivyQueryError(#[from] tantivy::query::QueryParserError),
32 #[error("JSON error")]
33 JsonError(#[from] serde_json::Error),
34 #[error("Serialize/deserialize error")]
35 SerdeError(#[from] serde_rusqlite::Error),
36 #[error("I/O error")]
37 IOError(#[from] std::io::Error),
38 #[error("Unexpected error")]
39 UnexpectedError(String),
40}
41
42#[derive(Debug, Default, Serialize, Deserialize)]
43pub struct ShardSummary {
44 pub story_count: usize,
45 pub scrape_count: usize,
46}
47
48#[derive(Debug, Default, Serialize, Deserialize)]
49pub struct StorageSummary {
50 pub by_shard: Vec<(String, ShardSummary)>,
51 pub total: ShardSummary,
52}
53
54#[derive(Debug)]
55pub enum StoryQuery {
57 ById(StoryIdentifier),
59 ByShard(Shard),
61 FrontPage,
63 TagSearch(String, Option<String>),
65 DomainSearch(String),
67 UrlSearch(StoryUrl),
69 TextSearch(String),
71 Related(String, Vec<String>),
73}
74
75pub trait IntoStoryQuery {
77 fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery;
78}
79
/// Marker for the string-like types that may be used as search text.
///
/// The trait adds no methods of its own; implementors only need to be viewable as `&str`.
/// It is implemented explicitly for each supported type rather than for every
/// `AsRef<str>` type.
trait StoryQueryString: AsRef<str> {}

// Lifetimes are elided throughout: none of these impls needs to name one, and declaring a
// lifetime parameter that is never used (as the `&String` impl previously did) is flagged
// by clippy.
impl StoryQueryString for &str {}
impl StoryQueryString for String {}
impl StoryQueryString for &String {}
impl StoryQueryString for Cow<'_, str> {}
86
87impl<S: StoryQueryString> IntoStoryQuery for S {
88 fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
89 StoryQuery::from_search(tagger, self.as_ref())
90 }
91}
92
93impl<S: StoryQueryString> IntoStoryQuery for &Option<S> {
94 fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
95 let Some(s) = self else {
96 return StoryQuery::FrontPage;
97 };
98 s.as_ref().into_story_query(tagger)
99 }
100}
101
102impl<S: StoryQueryString> IntoStoryQuery for Option<S> {
103 fn into_story_query(self, tagger: &StoryTagger) -> StoryQuery {
104 (&self).into_story_query(tagger)
105 }
106}
107
108impl StoryQuery {
109 pub fn query_text(&self) -> Cow<str> {
111 match self {
112 Self::FrontPage => "".into(),
113 Self::ById(id) => format!("id={id}").into(),
114 Self::ByShard(shard) => format!("shard={shard:?}").into(),
115 Self::DomainSearch(domain) => domain.into(),
116 Self::UrlSearch(url) => url.to_string().into(),
117 Self::TagSearch(tag, _) => tag.into(),
118 Self::TextSearch(text) => text.into(),
119 Self::Related(title, tags) => format!("title:{title:?} tags:{tags:?}").into(),
121 }
122 }
123
124 pub fn from_search(tagger: &StoryTagger, search: &str) -> Self {
125 let search = search.trim();
127
128 if search.is_empty() || !search.contains(|c: char| c.is_alphanumeric()) {
130 return Self::FrontPage;
131 }
132
133 if let Some(tag) = tagger.check_tag_search(search) {
135 let alt = if tag.eq_ignore_ascii_case(search) {
136 None
137 } else {
138 Some(search.to_ascii_lowercase())
139 };
140 StoryQuery::TagSearch(tag.to_string(), alt)
141 } else if let Some(domain_or_url) = Self::try_domain_or_url(search) {
142 domain_or_url
143 } else {
144 StoryQuery::TextSearch(search.to_string())
145 }
146 }
147
148 fn try_domain_or_url(search: &str) -> Option<StoryQuery> {
149 if search.contains('.') || search.contains(':') {
151 let url = if search.contains(':') {
152 StoryUrl::parse(search)
153 } else {
154 StoryUrl::parse(format!("http://{}", search))
156 };
157 if let Some(url) = url {
158 let host = url.host();
159 if host.contains(|c: char| !c.is_alphanumeric() && c != '.' && c != '-')
160 || !host.contains(|c: char| c.is_alphanumeric() || c == '-')
161 {
162 None
163 } else {
164 if search.contains('/') {
165 Some(StoryQuery::UrlSearch(url))
166 } else {
167 Some(StoryQuery::DomainSearch(url.host().to_owned()))
168 }
169 }
170 } else {
171 None
172 }
173 } else {
174 None
175 }
176 }
177}
178
179pub trait StoryScrapePayload: Send + Sync {}
180
181impl StoryScrapePayload for () {}
182impl StoryScrapePayload for Shard {}
183impl StoryScrapePayload for TypedScrape {}
184
185pub trait StorageFetch<S: StoryScrapePayload> {
186 fn fetch_type(&self, query: StoryQuery, max: usize) -> Result<Vec<Story<S>>, PersistError>;
187}
188
189pub trait Storage: Send + Sync {
191 fn most_recent_story(&self) -> Result<StoryDate, PersistError>;
193
194 fn shard_range(&self) -> Result<ShardRange, PersistError>;
196
197 fn story_count(&self) -> Result<StorageSummary, PersistError>;
199
200 fn fetch_count(&self, query: StoryQuery, max: usize) -> Result<usize, PersistError>;
202
203 fn fetch_detail_one(
205 &self,
206 query: StoryQuery,
207 ) -> Result<Option<HashMap<String, Vec<String>>>, PersistError>;
208
209 #[inline(always)]
211 fn fetch<S: StoryScrapePayload>(
212 &self,
213 query: StoryQuery,
214 max: usize,
215 ) -> Result<Vec<Story<S>>, PersistError>
216 where
217 Self: StorageFetch<S>,
218 {
219 <Self as StorageFetch<S>>::fetch_type(self, query, max)
220 }
221
222 #[inline(always)]
224 fn fetch_one<S: StoryScrapePayload>(
225 &self,
226 query: StoryQuery,
227 ) -> Result<Option<Story<S>>, PersistError>
228 where
229 Self: StorageFetch<S>,
230 {
231 Ok(<Self as StorageFetch<S>>::fetch_type(self, query, 1)?
232 .into_iter()
233 .next())
234 }
235}
236
237pub trait StorageWriter: Storage {
238 fn insert_scrapes<I: IntoIterator<Item = TypedScrape>>(
240 &mut self,
241 eval: &StoryEvaluator,
242 scrapes: I,
243 ) -> Result<Vec<ScrapePersistResult>, PersistError>;
244
245 fn insert_scrape_collections<I: IntoIterator<Item = ScrapeCollection>>(
247 &mut self,
248 eval: &StoryEvaluator,
249 stories: I,
250 ) -> Result<Vec<ScrapePersistResult>, PersistError>;
251
252 fn reinsert_stories<I: IntoIterator<Item = StoryIdentifier>>(
254 &mut self,
255 eval: &StoryEvaluator,
256 stories: I,
257 ) -> Result<Vec<ScrapePersistResult>, PersistError>;
258}
259
260#[derive(Debug, Serialize, PartialEq, Eq, Ord, PartialOrd, Hash)]
261pub enum ScrapePersistResult {
262 MergedWithExistingStory,
264 AlreadyPartOfExistingStory,
266 NewStory,
268 NotFound,
270}
271
272#[derive(Default, Debug, Serialize)]
273pub struct ScrapePersistResultSummary {
274 pub merged: usize,
275 pub existing: usize,
276 pub new: usize,
277 pub not_found: usize,
278}
279
280impl AddAssign for ScrapePersistResultSummary {
281 fn add_assign(&mut self, rhs: Self) {
282 self.merged += rhs.merged;
283 self.existing += rhs.existing;
284 self.new += rhs.new;
285 self.not_found += rhs.not_found;
286 }
287}
288
289pub trait ScrapePersistResultSummarizer {
290 fn summary(&self) -> ScrapePersistResultSummary;
291}
292
293impl ScrapePersistResultSummarizer for Vec<ScrapePersistResult> {
294 fn summary(&self) -> ScrapePersistResultSummary {
295 let mut summary = ScrapePersistResultSummary::default();
296 for x in self {
297 match x {
298 &ScrapePersistResult::MergedWithExistingStory => summary.merged += 1,
299 &ScrapePersistResult::AlreadyPartOfExistingStory => summary.existing += 1,
300 &ScrapePersistResult::NewStory => summary.new += 1,
301 &ScrapePersistResult::NotFound => summary.not_found += 1,
302 }
303 }
304 summary
305 }
306}
307
/// Where persisted data lives.
#[derive(Clone, Debug)]
pub enum PersistLocation {
    /// No path on disk is involved.
    Memory,
    /// Data lives at the given filesystem path.
    Path(PathBuf),
}
316
317impl PersistLocation {
318 pub fn join<P: AsRef<std::path::Path>>(&self, p: P) -> PersistLocation {
319 match self {
320 PersistLocation::Memory => PersistLocation::Memory,
321 PersistLocation::Path(path) => PersistLocation::Path(path.join(p)),
322 }
323 }
324}