// progscrape_application/story/scorer.rs
use serde::{Deserialize, Serialize};
2
3use progscrape_scrapers::{
4 ExtractedScrapeCollection, ScrapeCore, ScrapeSource, StoryDate, StoryDuration, TypedScrape,
5 TypedScrapeMap,
6};
7
8use super::Story;
9
/// Configuration for the bonus a story earns by appearing in multiple
/// sources: `count.powf(power) * factor` (applied in `score_impl`).
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct StoryScoreMultiSourceConfig {
    /// Exponent applied to the number of scrapes for the story.
    power: f32,
    /// Linear multiplier applied after exponentiation.
    factor: f32,
}
15
/// Tunable parameters for story scoring, loaded via serde.
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct StoryScoreConfig {
    /// Two breakpoints, in days, splitting the piecewise-linear age
    /// penalty computed by `score_age` into three segments.
    age_breakpoint_days: [u32; 2],
    /// Per-hour score slope for each of the three age segments.
    hour_scores: [f32; 3],
    /// Per-service multiplier applied to the front-page rank score.
    service_rank: TypedScrapeMap<f32>,
    /// Per-service flat boost (only applied when above `f32::EPSILON`).
    service_boost: TypedScrapeMap<f32>,
    /// Parameters for the multi-source bonus.
    multi_source: StoryScoreMultiSourceConfig,
}
24
/// Selects how a story score is computed.
/// NOTE(review): not referenced elsewhere in this file — presumably
/// consumed by callers in other modules; confirm before removing.
pub enum StoryScoreType {
    /// The stored base score, with no age adjustment.
    Base,
    /// The score re-aged relative to the given date.
    AgedFrom(StoryDate),
}
29
/// Reason tags for the individual components that make up a story's
/// score; emitted alongside each delta by the scoring accumulators
/// (see `score_single` / `score_detail`).
#[derive(Debug)]
pub enum StoryScore {
    /// Age-based penalty component.
    Age,
    /// Deterministic per-URL jitter.
    Random,
    /// Bonus for appearing in multiple sources.
    SourceCount,
    /// Penalty for overly long Reddit titles.
    LongRedditTitle,
    /// Penalty for overly long titles generally.
    LongTitle,
    /// Penalty for image-hosting links.
    ImageLink,
    /// Penalty for links back into the scraped service itself.
    SelfLink,
    /// Penalty for a low Reddit upvote ratio.
    PoorUpvoteRatio,
    /// Bonus/penalty based on upvote count.
    UpvoteCount,
    /// Bonus/penalty based on comment count.
    CommentCount,
    /// Bonus for front-page rank position on the given service.
    Position(ScrapeSource),
    /// Flat per-service boost.
    Source(ScrapeSource),
}
45
46impl Serialize for StoryScore {
47 fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
48 where
49 S: serde::Serializer,
50 {
51 format!("{:?}", self).serialize(serializer)
52 }
53}
54
/// Computes story scores from scraped data according to a
/// `StoryScoreConfig`.
pub struct StoryScorer {
    /// Owned copy of the scoring configuration.
    config: StoryScoreConfig,
}
58
59trait ServiceScorer {}
60
61impl StoryScorer {
64 pub fn new(config: &StoryScoreConfig) -> Self {
65 Self {
66 config: config.clone(),
67 }
68 }
69
70 pub fn resort_stories<S>(&self, relative_to: StoryDate, stories: &mut [Story<S>]) {
72 let new_score =
73 move |story: &Story<S>| story.score + self.score_age(relative_to - story.date);
74
75 stories.sort_by_cached_key(|story| (new_score(story) * -100000.0) as i64);
76 }
77
78 #[inline(always)]
79 pub fn score_age(&self, age: StoryDuration) -> f32 {
80 let breakpoint1 = StoryDuration::days(self.config.age_breakpoint_days[0] as i64);
81 let breakpoint2 = StoryDuration::days(self.config.age_breakpoint_days[1] as i64);
82 let hour_score0 = self.config.hour_scores[0];
83 let hour_score1 = self.config.hour_scores[1];
84 let hour_score2 = self.config.hour_scores[2];
85
86 const MILLIS_TO_HOURS: f32 = 60.0 * 60.0 * 1000.0;
88
89 let fractional_hours = f32::max(0.0, age.num_milliseconds() as f32 / MILLIS_TO_HOURS);
91
92 if age < breakpoint1 {
93 fractional_hours * hour_score0
94 } else if age < breakpoint2 {
95 breakpoint1.num_hours() as f32 * hour_score0
96 + (fractional_hours - breakpoint1.num_hours() as f32) * hour_score1
97 } else {
98 breakpoint1.num_hours() as f32 * hour_score0
99 + (breakpoint2 - breakpoint1).num_hours() as f32 * hour_score1
100 + (fractional_hours - breakpoint2.num_hours() as f32) * hour_score2
101 }
102 }
103
104 #[inline(always)]
107 fn score_single<T: FnMut(StoryScore, f32)>(
108 &self,
109 scrape: &TypedScrape,
110 core: &ScrapeCore,
111 mut accum: T,
112 ) {
113 use StoryScore::*;
114
115 let url = core.url;
116
117 let source = scrape.id.source;
118 if let Some(rank) = core.rank {
119 accum(
120 Position(source),
121 (30.0 - rank.clamp(0, 30) as f32) * self.config.service_rank.get(source),
122 );
123 }
124 let boost = *self.config.service_boost.get(source);
125 if boost > f32::EPSILON {
126 accum(Source(source), boost);
127 }
128 if url.host().contains("gfycat")
129 || url.host().contains("imgur")
130 || url.host().contains("i.reddit.com")
131 {
132 if source == ScrapeSource::HackerNews {
133 accum(ImageLink, -5.0);
134 } else {
135 accum(ImageLink, -10.0);
136 }
137 }
138
139 match scrape {
140 TypedScrape::HackerNews(hn) => {
141 if hn.data.comments > 100 {
142 accum(CommentCount, 5.0);
143 }
144 }
145 TypedScrape::Reddit(reddit) => {
146 if url.host().contains("reddit.com") {
148 accum(SelfLink, -20.0);
149 }
150
151 if core.title.len() > 130 {
153 accum(LongRedditTitle, -5.0);
154 }
155
156 if reddit.data.upvote_ratio < 0.6 {
157 accum(PoorUpvoteRatio, -20.0);
158 }
159 if reddit.data.upvotes < 10 {
160 accum(UpvoteCount, -20.0);
161 } else if reddit.data.upvotes > 10 {
162 accum(UpvoteCount, 10.0);
163 } else if reddit.data.upvotes > 100 {
164 accum(UpvoteCount, 15.0);
165 }
166 if reddit.data.num_comments < 10 {
167 accum(CommentCount, -5.0);
168 } else if reddit.data.num_comments > 10 {
169 accum(CommentCount, 5.0);
170 }
171 }
172 TypedScrape::Lobsters(lobsters) => {
173 if lobsters.data.num_comments > 100 {
175 accum(CommentCount, 5.0);
176 }
177 }
178 TypedScrape::Slashdot(slashdot) => {
179 if slashdot.data.num_comments > 100 {
180 accum(CommentCount, 5.0);
181 }
182 }
183 }
184 }
185
186 #[inline(always)]
187 fn score_impl<T: FnMut(StoryScore, f32)>(
188 &self,
189 scrapes: &ExtractedScrapeCollection,
190 best: TypedScrapeMap<Option<(&TypedScrape, &ScrapeCore, f32)>>,
191 mut accum: T,
192 ) {
193 use StoryScore::*;
194
195 let title = scrapes.title();
196 let url = scrapes.url();
197
198 accum(
200 Random,
201 (url.normalization().hash() % 6000000) as f32 / 1000000.0,
202 );
203
204 accum(
205 SourceCount,
206 (scrapes.scrapes.len() as f32).powf(self.config.multi_source.power)
207 * self.config.multi_source.factor,
208 );
209
210 for (scrape, core, _) in best.values().flatten() {
211 self.score_single(scrape, core, &mut accum);
212 }
213
214 if title.len() > 250 {
216 accum(LongTitle, -15.0);
217 }
218 }
219
220 fn calculate_best_scrapes<'a, 'b>(
221 &self,
222 scrapes: &'a ExtractedScrapeCollection<'b>,
223 ) -> TypedScrapeMap<Option<(&'a TypedScrape, &'a ScrapeCore<'b>, f32)>> {
224 let mut service_scrapes = TypedScrapeMap::new();
225 for (id, (core, scrape)) in &scrapes.scrapes {
226 let mut score_total = 0_f32;
227 let accum = |_, score| score_total += score;
228 self.score_single(scrape, core, accum);
229 if let Some((_, _, existing_score)) = service_scrapes.get(id.source) {
230 if *existing_score > score_total {
231 continue;
232 }
233 }
234 service_scrapes.set(id.source, Some((*scrape, core, score_total)));
235 }
236 service_scrapes
237 }
238
239 pub fn score(&self, scrapes: &ExtractedScrapeCollection) -> f32 {
240 let best = self.calculate_best_scrapes(scrapes);
241 let mut score_total = 0_f32;
242 let accum = |_, score| score_total += score;
243 self.score_impl(scrapes, best, accum);
244 score_total
245 }
246
247 pub fn score_detail(
248 &self,
249 scrapes: &ExtractedScrapeCollection,
250 now: StoryDate,
251 ) -> Vec<(StoryScore, f32)> {
252 let best = self.calculate_best_scrapes(scrapes);
253 let mut score_bits = vec![];
254 let mut accum = |score_type, score| score_bits.push((score_type, score));
255 accum(StoryScore::Age, self.score_age(now - scrapes.earliest));
256 self.score_impl(scrapes, best, accum);
257 score_bits
258 }
259}
260
#[cfg(test)]
mod test {
    use super::*;

    /// The age score must be strictly monotonically decreasing over time,
    /// across both breakpoints.
    #[test]
    fn test_age_score() {
        let scorer = StoryScorer::new(&StoryScoreConfig {
            age_breakpoint_days: [1, 30],
            hour_scores: [-5.0, -3.0, -0.1],
            service_rank: TypedScrapeMap::new_with_all(1.0),
            service_boost: TypedScrapeMap::new_with_all(1.0),
            multi_source: StoryScoreMultiSourceConfig {
                power: 2.0,
                factor: 10.0,
            },
        });
        let mut prev = f32::MAX;
        for hour in 0..StoryDuration::days(60).num_hours() {
            let current = scorer.score_age(StoryDuration::hours(hour));
            assert!(current < prev, "{} < {}", current, prev);
            prev = current;
        }
    }
}