// progscrape_application/story/scorer.rs

1use serde::{Deserialize, Serialize};
2
3use progscrape_scrapers::{
4    ExtractedScrapeCollection, ScrapeCore, ScrapeSource, StoryDate, StoryDuration, TypedScrape,
5    TypedScrapeMap,
6};
7
8use super::Story;
9
/// Tunables for the bonus a story earns for being reported by multiple
/// sources: the bonus is `source_count.powf(power) * factor`
/// (see `StoryScorer::score_impl`).
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct StoryScoreMultiSourceConfig {
    // Exponent applied to the number of scrapes for the story.
    power: f32,
    // Linear multiplier applied after exponentiation.
    factor: f32,
}
15
/// Weights and breakpoints consumed by [`StoryScorer`].
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct StoryScoreConfig {
    // Two breakpoints (in days) splitting a story's age into three bands.
    age_breakpoint_days: [u32; 2],
    // Per-hour score delta applied within each of the three age bands.
    hour_scores: [f32; 3],
    // Per-service multiplier for the front-page-rank score component.
    service_rank: TypedScrapeMap<f32>,
    // Per-service flat boost added when that service reports the story.
    service_boost: TypedScrapeMap<f32>,
    // Tunables for the multi-source bonus.
    multi_source: StoryScoreMultiSourceConfig,
}
24
/// Distinguishes a base score from one aged relative to a given date.
/// NOTE(review): not referenced anywhere in this file — presumably used by
/// callers elsewhere in the crate; verify before removing.
pub enum StoryScoreType {
    // The raw score with no age adjustment.
    Base,
    // Score aged relative to the contained date.
    AgedFrom(StoryDate),
}
29
/// Labels for the individual components that can contribute to a story's
/// score; each is paired with its value in [`StoryScorer::score_detail`].
#[derive(Debug)]
pub enum StoryScore {
    // Age-based component (see `StoryScorer::score_age`).
    Age,
    // Small URL-hash-derived shuffle to mix up the front page.
    Random,
    // Bonus scaled by the number of scrapes reporting the story.
    SourceCount,
    // Penalty for a long title when Reddit is a source.
    LongRedditTitle,
    // Penalty for a very long title regardless of source.
    LongTitle,
    // Penalty for image-hosting links (gfycat/imgur/i.reddit.com).
    ImageLink,
    // Penalty for Reddit self links.
    SelfLink,
    // Penalty for a low Reddit upvote ratio.
    PoorUpvoteRatio,
    // Component based on Reddit upvote count.
    UpvoteCount,
    // Component based on comment count.
    CommentCount,
    // Front-page-rank component for the given source.
    Position(ScrapeSource),
    // Flat per-service boost for the given source.
    Source(ScrapeSource),
}
45
46impl Serialize for StoryScore {
47    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
48    where
49        S: serde::Serializer,
50    {
51        format!("{:?}", self).serialize(serializer)
52    }
53}
54
/// Computes base and age-adjusted scores for stories from their scrapes,
/// using the weights in a [`StoryScoreConfig`].
pub struct StoryScorer {
    // Owned copy of the scoring configuration (cloned in `new`).
    config: StoryScoreConfig,
}
58
// NOTE(review): empty placeholder trait — no methods and no implementors in
// this file; the sketch below was never completed. Verify it is unused
// elsewhere before removing.
trait ServiceScorer {}

// impl ServiceScorer for Generic
62
63impl StoryScorer {
64    pub fn new(config: &StoryScoreConfig) -> Self {
65        Self {
66            config: config.clone(),
67        }
68    }
69
70    /// Re-scores stories w/age score.
71    pub fn resort_stories<S>(&self, relative_to: StoryDate, stories: &mut [Story<S>]) {
72        let new_score =
73            move |story: &Story<S>| story.score + self.score_age(relative_to - story.date);
74
75        stories.sort_by_cached_key(|story| (new_score(story) * -100000.0) as i64);
76    }
77
78    #[inline(always)]
79    pub fn score_age(&self, age: StoryDuration) -> f32 {
80        let breakpoint1 = StoryDuration::days(self.config.age_breakpoint_days[0] as i64);
81        let breakpoint2 = StoryDuration::days(self.config.age_breakpoint_days[1] as i64);
82        let hour_score0 = self.config.hour_scores[0];
83        let hour_score1 = self.config.hour_scores[1];
84        let hour_score2 = self.config.hour_scores[2];
85
86        // Equivalent to Duration::hours(1).num_milliseconds() as f32;
87        const MILLIS_TO_HOURS: f32 = 60.0 * 60.0 * 1000.0;
88
89        // Fractional hours, clamped to zero
90        let fractional_hours = f32::max(0.0, age.num_milliseconds() as f32 / MILLIS_TO_HOURS);
91
92        if age < breakpoint1 {
93            fractional_hours * hour_score0
94        } else if age < breakpoint2 {
95            breakpoint1.num_hours() as f32 * hour_score0
96                + (fractional_hours - breakpoint1.num_hours() as f32) * hour_score1
97        } else {
98            breakpoint1.num_hours() as f32 * hour_score0
99                + (breakpoint2 - breakpoint1).num_hours() as f32 * hour_score1
100                + (fractional_hours - breakpoint2.num_hours() as f32) * hour_score2
101        }
102    }
103
104    /// Score a single scrape so that we can evaluate which of multiple stories we want to
105    /// choose.
106    #[inline(always)]
107    fn score_single<T: FnMut(StoryScore, f32)>(
108        &self,
109        scrape: &TypedScrape,
110        core: &ScrapeCore,
111        mut accum: T,
112    ) {
113        use StoryScore::*;
114
115        let url = core.url;
116
117        let source = scrape.id.source;
118        if let Some(rank) = core.rank {
119            accum(
120                Position(source),
121                (30.0 - rank.clamp(0, 30) as f32) * self.config.service_rank.get(source),
122            );
123        }
124        let boost = *self.config.service_boost.get(source);
125        if boost > f32::EPSILON {
126            accum(Source(source), boost);
127        }
128        if url.host().contains("gfycat")
129            || url.host().contains("imgur")
130            || url.host().contains("i.reddit.com")
131        {
132            if source == ScrapeSource::HackerNews {
133                accum(ImageLink, -5.0);
134            } else {
135                accum(ImageLink, -10.0);
136            }
137        }
138
139        match scrape {
140            TypedScrape::HackerNews(hn) => {
141                if hn.data.comments > 100 {
142                    accum(CommentCount, 5.0);
143                }
144            }
145            TypedScrape::Reddit(reddit) => {
146                // Penalize Reddit self links
147                if url.host().contains("reddit.com") {
148                    accum(SelfLink, -20.0);
149                }
150
151                // Penalize a long title if reddit is a source
152                if core.title.len() > 130 {
153                    accum(LongRedditTitle, -5.0);
154                }
155
156                if reddit.data.upvote_ratio < 0.6 {
157                    accum(PoorUpvoteRatio, -20.0);
158                }
159                if reddit.data.upvotes < 10 {
160                    accum(UpvoteCount, -20.0);
161                } else if reddit.data.upvotes > 10 {
162                    accum(UpvoteCount, 10.0);
163                } else if reddit.data.upvotes > 100 {
164                    accum(UpvoteCount, 15.0);
165                }
166                if reddit.data.num_comments < 10 {
167                    accum(CommentCount, -5.0);
168                } else if reddit.data.num_comments > 10 {
169                    accum(CommentCount, 5.0);
170                }
171            }
172            TypedScrape::Lobsters(lobsters) => {
173                // This won't get triggered until we start scraping lobsters comment counts
174                if lobsters.data.num_comments > 100 {
175                    accum(CommentCount, 5.0);
176                }
177            }
178            TypedScrape::Slashdot(slashdot) => {
179                if slashdot.data.num_comments > 100 {
180                    accum(CommentCount, 5.0);
181                }
182            }
183        }
184    }
185
186    #[inline(always)]
187    fn score_impl<T: FnMut(StoryScore, f32)>(
188        &self,
189        scrapes: &ExtractedScrapeCollection,
190        best: TypedScrapeMap<Option<(&TypedScrape, &ScrapeCore, f32)>>,
191        mut accum: T,
192    ) {
193        use StoryScore::*;
194
195        let title = scrapes.title();
196        let url = scrapes.url();
197
198        // Small random shuffle for stories to mix up the front page a bit
199        accum(
200            Random,
201            (url.normalization().hash() % 6000000) as f32 / 1000000.0,
202        );
203
204        accum(
205            SourceCount,
206            (scrapes.scrapes.len() as f32).powf(self.config.multi_source.power)
207                * self.config.multi_source.factor,
208        );
209
210        for (scrape, core, _) in best.values().flatten() {
211            self.score_single(scrape, core, &mut accum);
212        }
213
214        // Penalize a really long title regardless of source
215        if title.len() > 250 {
216            accum(LongTitle, -15.0);
217        }
218    }
219
220    fn calculate_best_scrapes<'a, 'b>(
221        &self,
222        scrapes: &'a ExtractedScrapeCollection<'b>,
223    ) -> TypedScrapeMap<Option<(&'a TypedScrape, &'a ScrapeCore<'b>, f32)>> {
224        let mut service_scrapes = TypedScrapeMap::new();
225        for (id, (core, scrape)) in &scrapes.scrapes {
226            let mut score_total = 0_f32;
227            let accum = |_, score| score_total += score;
228            self.score_single(scrape, core, accum);
229            if let Some((_, _, existing_score)) = service_scrapes.get(id.source) {
230                if *existing_score > score_total {
231                    continue;
232                }
233            }
234            service_scrapes.set(id.source, Some((*scrape, core, score_total)));
235        }
236        service_scrapes
237    }
238
239    pub fn score(&self, scrapes: &ExtractedScrapeCollection) -> f32 {
240        let best = self.calculate_best_scrapes(scrapes);
241        let mut score_total = 0_f32;
242        let accum = |_, score| score_total += score;
243        self.score_impl(scrapes, best, accum);
244        score_total
245    }
246
247    pub fn score_detail(
248        &self,
249        scrapes: &ExtractedScrapeCollection,
250        now: StoryDate,
251    ) -> Vec<(StoryScore, f32)> {
252        let best = self.calculate_best_scrapes(scrapes);
253        let mut score_bits = vec![];
254        let mut accum = |score_type, score| score_bits.push((score_type, score));
255        accum(StoryScore::Age, self.score_age(now - scrapes.earliest));
256        self.score_impl(scrapes, best, accum);
257        score_bits
258    }
259}
260
#[cfg(test)]
mod test {
    use super::*;

    /// Make sure that the scores are decreasing: the age penalty must be
    /// strictly decreasing hour over hour across all three age bands.
    #[test]
    fn test_age_score() {
        let config = StoryScoreConfig {
            age_breakpoint_days: [1, 30],
            hour_scores: [-5.0, -3.0, -0.1],
            service_rank: TypedScrapeMap::new_with_all(1.0),
            service_boost: TypedScrapeMap::new_with_all(1.0),
            multi_source: StoryScoreMultiSourceConfig {
                power: 2.0,
                factor: 10.0,
            },
        };
        let scorer = StoryScorer::new(&config);
        // Sample every hour over 60 days, then check consecutive pairs.
        let scores: Vec<f32> = (0..StoryDuration::days(60).num_hours())
            .map(|hour| scorer.score_age(StoryDuration::hours(hour)))
            .collect();
        for pair in scores.windows(2) {
            let (prev, score) = (pair[0], pair[1]);
            assert!(score < prev, "{} < {}", score, prev);
        }
    }
}