// progscrape_scrapers/collections.rs
use std::{
2 borrow::Cow,
3 collections::{HashMap, HashSet, hash_map::Entry},
4};
5
6use itertools::Itertools;
7use serde::{Deserialize, Serialize};
8
9use crate::{
10 ScrapeExtractor, ScrapeId, ScrapeSource, StoryDate, StoryUrl, TypedScrape, backends::ScrapeCore,
11};
12
/// All scrapes collected for a single story, deduplicated by `ScrapeId`.
///
/// Built from one or more `TypedScrape`s; see `new_from_one`, `new_from_iter`,
/// and `merge` in the accompanying `impl`.
#[derive(Serialize, Deserialize, Debug)]
pub struct ScrapeCollection {
    // Minimum `date` observed across every scrape merged into this collection.
    pub earliest: StoryDate,

    // Scrapes keyed by their id; merging a scrape with an existing id merges
    // the two payloads rather than replacing (see `ScrapeCollection::merge`).
    pub scrapes: HashMap<ScrapeId, TypedScrape>,
}
22
23impl ScrapeCollection {
24 pub fn new_from_one(scrape: TypedScrape) -> Self {
25 Self {
26 earliest: scrape.date,
27 scrapes: HashMap::from_iter([(scrape.id.clone(), scrape)]),
28 }
29 }
30
31 pub fn new_from_iter(scrapes: impl Iterator<Item = TypedScrape>) -> Self {
32 let scrapes = HashMap::from_iter(scrapes.map(|s| (s.id.clone(), s)));
33 let earliest = scrapes
34 .values()
35 .map(|x| x.date)
36 .min()
37 .expect("Requires at least one TypedScrape");
38 Self { earliest, scrapes }
39 }
40
41 pub fn merge(&mut self, scrape: TypedScrape) {
43 self.earliest = self.earliest.min(scrape.date);
44 match self.scrapes.entry(scrape.id.clone()) {
45 Entry::Occupied(mut x) => {
46 x.get_mut().merge(scrape);
47 }
48 Entry::Vacant(x) => {
49 x.insert(scrape);
50 }
51 }
52 }
53
54 pub fn merge_all(&mut self, other: Self) {
56 for scrape in other.scrapes.into_values() {
57 self.merge(scrape)
58 }
59 }
60
61 pub fn url(&self) -> &StoryUrl {
62 &self
63 .scrapes
64 .values()
65 .next()
66 .expect("Requires at least one TypedScrape")
67 .url
68 }
69
70 pub fn extract<'a>(&'a self, extractor: &ScrapeExtractor) -> ExtractedScrapeCollection<'a> {
71 let title_score = |source: &ScrapeSource| {
72 match source {
73 ScrapeSource::HackerNews => 0,
75 ScrapeSource::Feed => 1,
76 ScrapeSource::Lobsters => 2,
77 ScrapeSource::Slashdot => 3,
78 ScrapeSource::Reddit => 4,
80 ScrapeSource::Other => 99,
81 }
82 };
83
84 let iter = self
85 .scrapes
86 .iter()
87 .map(|(k, v)| (k, (extractor.extract(v), v)));
88 let scrapes = HashMap::from_iter(iter);
89 let mut title_story = *scrapes
90 .iter()
91 .next()
92 .expect("Expected at least one scrape")
93 .0;
94 let mut max_title_score = i32::MAX;
95 for (id, (_, _)) in &scrapes {
96 let this_score = title_score(&id.source);
97 if this_score < max_title_score {
98 max_title_score = this_score;
99 title_story = *id;
100 }
101 }
102
103 ExtractedScrapeCollection {
104 earliest: self.earliest,
105 title_story,
106 scrapes,
107 }
108 }
109}
110
/// A borrowed view over a `ScrapeCollection`: each scrape paired with its
/// extracted `ScrapeCore`, plus the id of the scrape chosen to supply the
/// story title.
pub struct ExtractedScrapeCollection<'a> {
    // Copied from the source collection's `earliest`.
    pub earliest: StoryDate,
    // Id of the scrape whose title represents the story; must be a key of
    // `scrapes` (invariant upheld by `ScrapeCollection::extract`).
    title_story: &'a ScrapeId,
    pub scrapes: HashMap<&'a ScrapeId, (ScrapeCore<'a>, &'a TypedScrape)>,
}
117
118impl<'a> ExtractedScrapeCollection<'a> {
119 pub fn title(&'a self) -> &'a str {
120 &self
121 .scrapes
122 .get(self.title_story)
123 .expect("Expected the title story to be in the scrape collection")
124 .0
125 .title
126 }
127
128 pub fn url(&'a self) -> &'a StoryUrl {
129 self.scrapes
130 .iter()
131 .next()
132 .expect("Expected at least one scrape")
133 .1
134 .0
135 .url
136 }
137
138 pub fn tags<'b>(&'b self) -> Vec<Cow<'a, str>> {
139 let mut tags = HashSet::new();
140 for (scrape, _) in self.scrapes.values() {
141 tags.extend(&scrape.tags);
142 }
143 tags.into_iter().cloned().collect_vec()
144 }
145 }