progscrape_application/story/
mod.rs

//! Stories begin as a `ScrapeCollection`, and we progressively analyze that collection to add further metadata,
2//! including tags, scores, and post-processing of the provided titles.
3use itertools::Itertools;
4use serde::{Deserialize, Serialize};
5
6use progscrape_scrapers::{
7    ScrapeConfig, ScrapeExtractor, ScrapeId, StoryDate, StoryUrl, TypedScrapeMap,
8};
9use std::collections::{HashMap, HashSet};
10
11mod collector;
12mod id;
13mod render;
14mod scorer;
15mod tagger;
16
17use crate::persist::Shard;
18
19pub use self::{
20    collector::StoryCollector,
21    id::StoryIdentifier,
22    render::StoryRender,
23    scorer::{StoryScore, StoryScoreConfig, StoryScorer},
24    tagger::{StoryTagger, TaggerConfig},
25};
26
/// Required services to evaluate a story.
///
/// Bundles the tagger, scorer and scrape extractor so they can be passed around as one value.
pub struct StoryEvaluator {
    /// Tagging service, constructed from a `TaggerConfig`.
    pub tagger: StoryTagger,
    /// Scoring service, constructed from a `StoryScoreConfig`.
    pub scorer: StoryScorer,
    /// Scrape extraction service, constructed from a `ScrapeConfig`.
    pub extractor: ScrapeExtractor,
}
33
34impl StoryEvaluator {
35    pub fn new(tagger: &TaggerConfig, scorer: &StoryScoreConfig, scrape: &ScrapeConfig) -> Self {
36        Self {
37            tagger: StoryTagger::new(tagger),
38            scorer: StoryScorer::new(scorer),
39            extractor: ScrapeExtractor::new(scrape),
40        }
41    }
42
43    #[cfg(test)]
44    pub fn new_for_test() -> Self {
45        Self::new(
46            &crate::story::tagger::test::tagger_config(),
47            &StoryScoreConfig::default(),
48            &ScrapeConfig::default(),
49        )
50    }
51}
52
/// Pairs a `ScrapeId` with a `Shard`.
// NOTE(review): presumably the shard is where the scrape is persisted (`Shard` comes from
// `crate::persist`) — confirm against the persistence layer.
#[derive(Clone, Debug)]
pub struct StoryScrapeId {
    /// Identifier of the scrape.
    pub id: ScrapeId,
    /// Shard associated with the scrape.
    pub shard: Shard,
}
58
59impl From<StoryScrapeId> for (ScrapeId, Shard) {
60    fn from(val: StoryScrapeId) -> Self {
61        (val.id, val.shard)
62    }
63}
64
/// Story scrape w/information from underlying sources.
///
/// `S` is the per-scrape payload stored in `scrapes`, keyed by `ScrapeId`.
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Story<S> {
    /// Identifier; `new_from_parts` derives it from the date and the normalized URL.
    pub id: StoryIdentifier,
    /// Score used by `compare_score` for ordering.
    pub score: f32,
    /// Date used by `compare_date` for ordering (and as the NaN fallback in `compare_score`).
    pub date: StoryDate,
    /// URL of the story.
    pub url: StoryUrl,
    /// Title of the story.
    pub title: String,
    /// Tags attached to the story.
    pub tags: TagSet,
    /// Scrapes contributing to this story, keyed by scrape ID.
    pub scrapes: HashMap<ScrapeId, S>,
}
76
77impl<S> Story<S> {
78    pub fn new_from_parts(
79        title: String,
80        url: StoryUrl,
81        date: StoryDate,
82        score: f32,
83        tags: impl IntoIterator<Item = String>,
84        scrapes: impl IntoIterator<Item = impl Into<(ScrapeId, S)>>,
85    ) -> Self {
86        Self {
87            id: StoryIdentifier::new(date, url.normalization()),
88            tags: TagSet::from_iter(tags),
89            title,
90            url,
91            date,
92            score,
93            scrapes: HashMap::from_iter(scrapes.into_iter().map(|x| x.into())),
94        }
95    }
96
97    /// Compares two stories, ordering by score.
98    pub fn compare_score(&self, other: &Self) -> std::cmp::Ordering {
99        // Sort by score, but fall back to date if score is somehow a NaN (it shouldn't be, but we'll just be robust here)
100        f32::partial_cmp(&self.score, &other.score).unwrap_or_else(|| self.date.cmp(&other.date))
101    }
102
103    /// Compares two stories, ordering by date.
104    pub fn compare_date(&self, other: &Self) -> std::cmp::Ordering {
105        self.date.cmp(&other.date)
106    }
107
108    /// Is this story likely a self-post? See the description of `ScrapeId::is_likely_self_post` for details of the
109    /// heuristic and caveats.
110    pub fn is_likely_self_post(&self) -> bool {
111        for scrape_id in self.scrapes.keys() {
112            if scrape_id.is_likely_self_post(&self.url) {
113                return true;
114            }
115        }
116        false
117    }
118
119    pub fn raw_tags(&self) -> Vec<String> {
120        let mut tags = vec![self.url.host().to_owned()];
121        tags.extend(self.tags.dump());
122        tags
123    }
124
125    pub fn render_tags(&self, tagger: &StoryTagger) -> Vec<String> {
126        let mut tags = vec![self.url.host().to_owned()];
127        tags.extend(tagger.make_display_tags(self.tags.dump()));
128        tags
129    }
130
131    pub fn render(&self, eval: &StoryEvaluator, order: usize) -> StoryRender {
132        let mut sources = TypedScrapeMap::new();
133        for (id, _) in &self.scrapes {
134            sources.set(id.source, Some(id.clone()));
135        }
136        StoryRender {
137            order,
138            id: self.id.to_base64(),
139            score: self.score,
140            url: self.url.to_string(),
141            domain: self.url.host().to_owned(),
142            title: self.title.to_owned(),
143            date: self.date,
144            tags: self.render_tags(&eval.tagger),
145            sources,
146        }
147    }
148}
149
/// An unordered set of tag strings.
///
/// Tags inserted through `add` are ASCII-lowercased first; `dump` and `collect` yield the tags
/// in sorted order.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct TagSet {
    /// Backing storage for the tags.
    set: HashSet<String>,
}
154
155impl TagSet {
156    pub fn new() -> Self {
157        Self {
158            set: HashSet::new(),
159        }
160    }
161
162    pub fn from_iter<S: AsRef<str>>(iter: impl IntoIterator<Item = S>) -> Self {
163        Self {
164            set: HashSet::from_iter(iter.into_iter().map(|s| s.as_ref().to_owned())),
165        }
166    }
167
168    pub fn contains(&self, tag: impl AsRef<str>) -> bool {
169        self.set.contains(tag.as_ref())
170    }
171
172    pub fn add(&mut self, tag: impl AsRef<str>) {
173        self.set.insert(tag.as_ref().to_ascii_lowercase());
174    }
175
176    pub fn collect(&self) -> Vec<String> {
177        self.dump().collect()
178    }
179
180    pub fn dump(&self) -> impl Iterator<Item = String> + '_ {
181        self.set.iter().sorted().cloned()
182    }
183
184    pub fn iter(&self) -> impl Iterator<Item = &String> {
185        self.set.iter()
186    }
187}
188
189impl<'a> IntoIterator for &'a TagSet {
190    type IntoIter = <&'a HashSet<String> as IntoIterator>::IntoIter;
191    type Item = <&'a HashSet<String> as IntoIterator>::Item;
192
193    fn into_iter(self) -> Self::IntoIter {
194        self.set.iter()
195    }
196}
197
198impl IntoIterator for TagSet {
199    type IntoIter = <HashSet<String> as IntoIterator>::IntoIter;
200    type Item = <HashSet<String> as IntoIterator>::Item;
201
202    fn into_iter(self) -> Self::IntoIter {
203        self.set.into_iter()
204    }
205}
206
207impl TagAcceptor for TagSet {
208    fn tag(&mut self, s: &str) {
209        self.add(s);
210    }
211}
212
/// A sink for tags: implementors receive one tag at a time through `tag`.
pub trait TagAcceptor {
    /// Accepts the single tag `s`.
    fn tag(&mut self, s: &str);
}