//! progscrape_scrapers/collections.rs

1use std::{
2    borrow::Cow,
3    collections::{hash_map::Entry, HashMap, HashSet},
4};
5
6use itertools::Itertools;
7use serde::{Deserialize, Serialize};
8
9use crate::{
10    backends::ScrapeCore, ScrapeExtractor, ScrapeId, ScrapeSource, StoryDate, StoryUrl, TypedScrape,
11};
12
/// Collection of scrapes, which can also extract the best title, etc.
#[derive(Serialize, Deserialize)]
pub struct ScrapeCollection {
    /// Date of the earliest scrape in this collection.
    pub earliest: StoryDate,

    // TODO: We need to clone the scrape ID because we can't use a reference to the key, and making this a hash set
    // prevents mutation.
    /// All scrapes, keyed by a clone of each scrape's `ScrapeId`.
    pub scrapes: HashMap<ScrapeId, TypedScrape>,
}
22
23impl ScrapeCollection {
24    pub fn new_from_one(scrape: TypedScrape) -> Self {
25        Self {
26            earliest: scrape.date,
27            scrapes: HashMap::from_iter([(scrape.id.clone(), scrape)]),
28        }
29    }
30
31    pub fn new_from_iter(scrapes: impl Iterator<Item = TypedScrape>) -> Self {
32        let scrapes = HashMap::from_iter(scrapes.map(|s| (s.id.clone(), s)));
33        let earliest = scrapes
34            .values()
35            .map(|x| x.date)
36            .min()
37            .expect("Requires at least one TypedScrape");
38        Self { earliest, scrapes }
39    }
40
41    /// Takes and merges another `TypedScrape`.
42    pub fn merge(&mut self, scrape: TypedScrape) {
43        self.earliest = self.earliest.min(scrape.date);
44        match self.scrapes.entry(scrape.id.clone()) {
45            Entry::Occupied(mut x) => {
46                x.get_mut().merge(scrape);
47            }
48            Entry::Vacant(x) => {
49                x.insert(scrape);
50            }
51        }
52    }
53
54    /// Takes and merges all the `TypedScrape`s from the other `ScrapeCollection`.
55    pub fn merge_all(&mut self, other: Self) {
56        for scrape in other.scrapes.into_values() {
57            self.merge(scrape)
58        }
59    }
60
61    pub fn url(&self) -> &StoryUrl {
62        &self
63            .scrapes
64            .values()
65            .next()
66            .expect("Requires at least one TypedScrape")
67            .url
68    }
69
70    pub fn extract<'a>(&'a self, extractor: &ScrapeExtractor) -> ExtractedScrapeCollection<'a> {
71        let title_score = |source: &ScrapeSource| {
72            match source {
73                // HN is moderated and titles are high quality
74                ScrapeSource::HackerNews => 0,
75                ScrapeSource::Lobsters => 1,
76                ScrapeSource::Slashdot => 2,
77                // User-submitted titles are generally just OK
78                ScrapeSource::Reddit => 3,
79                ScrapeSource::Other => 99,
80            }
81        };
82
83        let iter = self
84            .scrapes
85            .iter()
86            .map(|(k, v)| (k, (extractor.extract(v), v)));
87        let scrapes = HashMap::from_iter(iter);
88        let mut title_story = *scrapes
89            .iter()
90            .next()
91            .expect("Expected at least one scrape")
92            .0;
93        let mut max_title_score = i32::MAX;
94        for (id, (_, _)) in &scrapes {
95            let this_score = title_score(&id.source);
96            if this_score < max_title_score {
97                max_title_score = this_score;
98                title_story = *id;
99            }
100        }
101
102        ExtractedScrapeCollection {
103            earliest: self.earliest,
104            title_story,
105            scrapes,
106        }
107    }
108}
109
/// Collection of scrape data that has been extracted from a `ScrapeCollection`.
pub struct ExtractedScrapeCollection<'a> {
    /// Date of the earliest scrape in the underlying collection.
    pub earliest: StoryDate,
    /// Id of the scrape chosen as the title source during extraction.
    title_story: &'a ScrapeId,
    /// Extracted core plus a reference to the raw scrape, keyed by scrape id.
    pub scrapes: HashMap<&'a ScrapeId, (ScrapeCore<'a>, &'a TypedScrape)>,
}
116
117impl<'a> ExtractedScrapeCollection<'a> {
118    pub fn title(&'a self) -> &'a str {
119        &self
120            .scrapes
121            .get(self.title_story)
122            .expect("Expected the title story to be in the scrape collection")
123            .0
124            .title
125    }
126
127    pub fn url(&'a self) -> &'a StoryUrl {
128        self.scrapes
129            .iter()
130            .next()
131            .expect("Expected at least one scrape")
132            .1
133             .0
134            .url
135    }
136
137    pub fn tags<'b>(&'b self) -> Vec<Cow<'a, str>> {
138        let mut tags = HashSet::new();
139        for (scrape, _) in self.scrapes.values() {
140            tags.extend(&scrape.tags);
141        }
142        tags.into_iter().cloned().collect_vec()
143    }
144    // /// Choose a title based on source priority, with preference for shorter titles if the priority is the same.
145    // fn title_choice(&self) -> (ScrapeSource, Cow<str>) {
146    //     let title_score = |source: &ScrapeSource| {
147    //         match source {
148    //             // HN is moderated and titles are high quality
149    //             ScrapeSource::HackerNews => 0,
150    //             ScrapeSource::Lobsters => 1,
151    //             ScrapeSource::Slashdot => 2,
152    //             // User-submitted titles are generally just OK
153    //             ScrapeSource::Reddit => 3,
154    //             ScrapeSource::Other => 99,
155    //         }
156    //     };
157    //     let mut best_title = (99, &ScrapeSource::Other, Cow::Borrowed("Unknown title"));
158    //     for (id, scrape) in &self.scrapes {
159    //         let scrape = extractor.extract(scrape);
160    //         let score = title_score(&id.source);
161    //         if score < best_title.0 {
162    //             best_title = (score, &id.source, scrape.title);
163    //             continue;
164    //         }
165    //         let title = &scrape.title;
166    //         if score == best_title.0 && title.len() < best_title.2.len() {
167    //             best_title = (score, &id.source, scrape.title);
168    //             continue;
169    //         }
170    //     }
171    //     (*best_title.1, best_title.2)
172    // }
173}