// progscrape_scrapers/collections.rs
use std::{
2 borrow::Cow,
3 collections::{hash_map::Entry, HashMap, HashSet},
4};
5
6use itertools::Itertools;
7use serde::{Deserialize, Serialize};
8
9use crate::{
10 backends::ScrapeCore, ScrapeExtractor, ScrapeId, ScrapeSource, StoryDate, StoryUrl, TypedScrape,
11};
12
/// A set of scrapes keyed by [`ScrapeId`], presumably all referring to the
/// same story (see `url()` below, which assumes a shared URL — TODO confirm),
/// together with the earliest date any of them was seen.
#[derive(Serialize, Deserialize)]
pub struct ScrapeCollection {
    /// The minimum `date` across all scrapes in this collection.
    pub earliest: StoryDate,

    /// All scrapes, keyed by their id.
    pub scrapes: HashMap<ScrapeId, TypedScrape>,
}
22
23impl ScrapeCollection {
24 pub fn new_from_one(scrape: TypedScrape) -> Self {
25 Self {
26 earliest: scrape.date,
27 scrapes: HashMap::from_iter([(scrape.id.clone(), scrape)]),
28 }
29 }
30
31 pub fn new_from_iter(scrapes: impl Iterator<Item = TypedScrape>) -> Self {
32 let scrapes = HashMap::from_iter(scrapes.map(|s| (s.id.clone(), s)));
33 let earliest = scrapes
34 .values()
35 .map(|x| x.date)
36 .min()
37 .expect("Requires at least one TypedScrape");
38 Self { earliest, scrapes }
39 }
40
41 pub fn merge(&mut self, scrape: TypedScrape) {
43 self.earliest = self.earliest.min(scrape.date);
44 match self.scrapes.entry(scrape.id.clone()) {
45 Entry::Occupied(mut x) => {
46 x.get_mut().merge(scrape);
47 }
48 Entry::Vacant(x) => {
49 x.insert(scrape);
50 }
51 }
52 }
53
54 pub fn merge_all(&mut self, other: Self) {
56 for scrape in other.scrapes.into_values() {
57 self.merge(scrape)
58 }
59 }
60
61 pub fn url(&self) -> &StoryUrl {
62 &self
63 .scrapes
64 .values()
65 .next()
66 .expect("Requires at least one TypedScrape")
67 .url
68 }
69
70 pub fn extract<'a>(&'a self, extractor: &ScrapeExtractor) -> ExtractedScrapeCollection<'a> {
71 let title_score = |source: &ScrapeSource| {
72 match source {
73 ScrapeSource::HackerNews => 0,
75 ScrapeSource::Lobsters => 1,
76 ScrapeSource::Slashdot => 2,
77 ScrapeSource::Reddit => 3,
79 ScrapeSource::Other => 99,
80 }
81 };
82
83 let iter = self
84 .scrapes
85 .iter()
86 .map(|(k, v)| (k, (extractor.extract(v), v)));
87 let scrapes = HashMap::from_iter(iter);
88 let mut title_story = *scrapes
89 .iter()
90 .next()
91 .expect("Expected at least one scrape")
92 .0;
93 let mut max_title_score = i32::MAX;
94 for (id, (_, _)) in &scrapes {
95 let this_score = title_score(&id.source);
96 if this_score < max_title_score {
97 max_title_score = this_score;
98 title_story = *id;
99 }
100 }
101
102 ExtractedScrapeCollection {
103 earliest: self.earliest,
104 title_story,
105 scrapes,
106 }
107 }
108}
109
/// The result of running a `ScrapeExtractor` over a `ScrapeCollection`,
/// borrowing the ids and scrapes of the source collection.
pub struct ExtractedScrapeCollection<'a> {
    /// Earliest date, carried over from the source collection.
    pub earliest: StoryDate,
    /// Id of the scrape whose title is preferred (chosen by source priority).
    title_story: &'a ScrapeId,
    /// Extracted core data plus the original scrape, keyed by borrowed id.
    pub scrapes: HashMap<&'a ScrapeId, (ScrapeCore<'a>, &'a TypedScrape)>,
}
116
117impl<'a> ExtractedScrapeCollection<'a> {
118 pub fn title(&'a self) -> &'a str {
119 &self
120 .scrapes
121 .get(self.title_story)
122 .expect("Expected the title story to be in the scrape collection")
123 .0
124 .title
125 }
126
127 pub fn url(&'a self) -> &'a StoryUrl {
128 self.scrapes
129 .iter()
130 .next()
131 .expect("Expected at least one scrape")
132 .1
133 .0
134 .url
135 }
136
137 pub fn tags<'b>(&'b self) -> Vec<Cow<'a, str>> {
138 let mut tags = HashSet::new();
139 for (scrape, _) in self.scrapes.values() {
140 tags.extend(&scrape.tags);
141 }
142 tags.into_iter().cloned().collect_vec()
143 }
144 }