// progscrape_scrapers/collections.rs

use std::{
    borrow::Cow,
    collections::{HashMap, HashSet, hash_map::Entry},
};

use itertools::Itertools;
use serde::{Deserialize, Serialize};

use crate::{
    ScrapeExtractor, ScrapeId, ScrapeSource, StoryDate, StoryUrl, TypedScrape, backends::ScrapeCore,
};

/// Collection of scrapes, which can also extract the best title, etc.
#[derive(Serialize, Deserialize, Debug)]
pub struct ScrapeCollection {
    pub earliest: StoryDate,

    // TODO: We need to clone the scrape ID because we can't use a reference to the key, and
    // making this a hash set would prevent mutation.
    pub scrapes: HashMap<ScrapeId, TypedScrape>,
}

impl ScrapeCollection {
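    /// Creates a collection from a single scrape.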
    pub fn new_from_one(scrape: TypedScrape) -> Self {
        Self {
            earliest: scrape.date,
            scrapes: HashMap::from_iter([(scrape.id.clone(), scrape)]),
        }
    }

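    /// Creates a collection from an iterator of scrapes, tracking the
    /// earliest scrape date. Panics if the iterator yields no items.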
    pub fn new_from_iter(scrapes: impl Iterator<Item = TypedScrape>) -> Self {
        let scrapes = HashMap::from_iter(scrapes.map(|s| (s.id.clone(), s)));
        let earliest = scrapes
            .values()
            .map(|x| x.date)
            .min()
            .expect("Requires at least one TypedScrape");
        Self { earliest, scrapes }
    }

    /// Takes and merges another `TypedScrape`. If a scrape with the same
    /// `ScrapeId` is already present, the two are merged in place; otherwise
    /// the scrape is inserted as a new entry.
    pub fn merge(&mut self, scrape: TypedScrape) {
        self.earliest = self.earliest.min(scrape.date);
        match self.scrapes.entry(scrape.id.clone()) {
            Entry::Occupied(mut x) => {
                x.get_mut().merge(scrape);
            }
            Entry::Vacant(x) => {
                x.insert(scrape);
            }
        }
    }

    /// Takes and merges all the `TypedScrape`s from the other `ScrapeCollection`.
    pub fn merge_all(&mut self, other: Self) {
        for scrape in other.scrapes.into_values() {
            self.merge(scrape)
        }
    }

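    /// Returns the URL of an arbitrary scrape in the collection (all scrapes
    /// in a collection are expected to share the same story URL). Panics if
    /// the collection is empty.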
    pub fn url(&self) -> &StoryUrl {
        &self
            .scrapes
            .values()
            .next()
            .expect("Requires at least one TypedScrape")
            .url
    }

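    /// Extracts the `ScrapeCore` for every scrape and selects the scrape
    /// whose source ranks best for titles.
    ///
    /// A minimal usage sketch (assuming a `Vec<TypedScrape>` named `scrapes`
    /// for a single story and a configured `ScrapeExtractor` named
    /// `extractor`, both obtained elsewhere):
    ///
    /// ```ignore
    /// let collection = ScrapeCollection::new_from_iter(scrapes.into_iter());
    /// let extracted = collection.extract(&extractor);
    /// println!("best title: {}", extracted.title());
    /// ```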
    pub fn extract<'a>(&'a self, extractor: &ScrapeExtractor) -> ExtractedScrapeCollection<'a> {
        // Rank title quality by source; lower scores are preferred.
        let title_score = |source: &ScrapeSource| {
            match source {
                // HN is moderated and titles are high quality
                ScrapeSource::HackerNews => 0,
                ScrapeSource::Feed => 1,
                ScrapeSource::Lobsters => 2,
                ScrapeSource::Slashdot => 3,
                // User-submitted titles are generally just OK
                ScrapeSource::Reddit => 4,
                ScrapeSource::Other => 99,
            }
        };

        let iter = self
            .scrapes
            .iter()
            .map(|(k, v)| (k, (extractor.extract(v), v)));
        let scrapes = HashMap::from_iter(iter);

        // Choose the scrape whose source has the best (lowest) title score.
        let mut title_story = *scrapes
            .iter()
            .next()
            .expect("Expected at least one scrape")
            .0;
        let mut best_title_score = i32::MAX;
        for id in scrapes.keys() {
            let this_score = title_score(&id.source);
            if this_score < best_title_score {
                best_title_score = this_score;
                title_story = *id;
            }
        }

        ExtractedScrapeCollection {
            earliest: self.earliest,
            title_story,
            scrapes,
        }
    }
}

/// Collection of scrape data that has been extracted from a `ScrapeCollection`.
pub struct ExtractedScrapeCollection<'a> {
    pub earliest: StoryDate,
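    /// The scrape whose source ranks best for titles; always a key of `scrapes`.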
    title_story: &'a ScrapeId,
    pub scrapes: HashMap<&'a ScrapeId, (ScrapeCore<'a>, &'a TypedScrape)>,
}

impl<'a> ExtractedScrapeCollection<'a> {
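    /// Returns the title of the scrape selected as having the best title source.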
    pub fn title(&'a self) -> &'a str {
        &self
            .scrapes
            .get(self.title_story)
            .expect("Expected the title story to be in the scrape collection")
            .0
            .title
    }

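    /// Returns the URL of an arbitrary scrape in the collection. Panics if
    /// the collection is empty.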
    pub fn url(&'a self) -> &'a StoryUrl {
        self.scrapes
            .iter()
            .next()
            .expect("Expected at least one scrape")
            .1
            .0
            .url
    }

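    /// Returns the deduplicated tags collected across all scrapes.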
    pub fn tags<'b>(&'b self) -> Vec<Cow<'a, str>> {
        let mut tags = HashSet::new();
        for (scrape, _) in self.scrapes.values() {
            tags.extend(&scrape.tags);
        }
        tags.into_iter().cloned().collect_vec()
    }
    // /// Choose a title based on source priority, with preference for shorter titles if the priority is the same.
    // fn title_choice(&self) -> (ScrapeSource, Cow<str>) {
    //     let title_score = |source: &ScrapeSource| {
    //         match source {
    //             // HN is moderated and titles are high quality
    //             ScrapeSource::HackerNews => 0,
    //             ScrapeSource::Lobsters => 1,
    //             ScrapeSource::Slashdot => 2,
    //             // User-submitted titles are generally just OK
    //             ScrapeSource::Reddit => 3,
    //             ScrapeSource::Other => 99,
    //         }
    //     };
    //     let mut best_title = (99, &ScrapeSource::Other, Cow::Borrowed("Unknown title"));
    //     for (id, scrape) in &self.scrapes {
    //         let scrape = extractor.extract(scrape);
    //         let score = title_score(&id.source);
    //         if score < best_title.0 {
    //             best_title = (score, &id.source, scrape.title);
    //             continue;
    //         }
    //         let title = &scrape.title;
    //         if score == best_title.0 && title.len() < best_title.2.len() {
    //             best_title = (score, &id.source, scrape.title);
    //             continue;
    //         }
    //     }
    //     (*best_title.1, best_title.2)
    // }
}