progscrape_application/story/
mod.rs1use itertools::Itertools;
4use serde::{Deserialize, Serialize};
5
6use progscrape_scrapers::{
7 ScrapeConfig, ScrapeExtractor, ScrapeId, StoryDate, StoryUrl, TypedScrapeMap,
8};
9use std::collections::{HashMap, HashSet};
10
11mod collector;
12mod id;
13mod render;
14mod scorer;
15mod tagger;
16
17use crate::persist::Shard;
18
19pub use self::{
20 collector::StoryCollector,
21 id::StoryIdentifier,
22 render::StoryRender,
23 scorer::{StoryScore, StoryScoreConfig, StoryScorer},
24 tagger::{StoryTagger, TaggerConfig},
25};
26
/// Bundles the tagging, scoring and scrape-extraction components that are used together when
/// evaluating stories.
pub struct StoryEvaluator {
    /// Tagger constructed from a [`TaggerConfig`].
    pub tagger: StoryTagger,
    /// Scorer constructed from a [`StoryScoreConfig`].
    pub scorer: StoryScorer,
    /// Extractor constructed from a [`ScrapeConfig`].
    pub extractor: ScrapeExtractor,
}
33
34impl StoryEvaluator {
35 pub fn new(tagger: &TaggerConfig, scorer: &StoryScoreConfig, scrape: &ScrapeConfig) -> Self {
36 Self {
37 tagger: StoryTagger::new(tagger),
38 scorer: StoryScorer::new(scorer),
39 extractor: ScrapeExtractor::new(scrape),
40 }
41 }
42
43 #[cfg(test)]
44 pub fn new_for_test() -> Self {
45 Self::new(
46 &crate::story::tagger::test::tagger_config(),
47 &StoryScoreConfig::default(),
48 &ScrapeConfig::default(),
49 )
50 }
51}
52
/// Pairs a [`ScrapeId`] with a [`Shard`].
///
/// Converts into a `(ScrapeId, Shard)` tuple via `From`.
// NOTE(review): presumably `shard` is the persistence shard that holds the scrape — confirm
// against `crate::persist`.
#[derive(Clone, Debug)]
pub struct StoryScrapeId {
    /// Identifier of the scrape.
    pub id: ScrapeId,
    /// Shard associated with the scrape.
    pub shard: Shard,
}
58
59impl From<StoryScrapeId> for (ScrapeId, Shard) {
60 fn from(val: StoryScrapeId) -> Self {
61 (val.id, val.shard)
62 }
63}
64
/// A story together with its per-scrape payloads of type `S`, keyed by [`ScrapeId`].
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct Story<S> {
    /// Identifier; `new_from_parts` derives it from the date and the normalized URL.
    pub id: StoryIdentifier,
    /// Score used by `compare_score` for ordering.
    pub score: f32,
    /// Date used by `compare_date`, and as the fallback ordering in `compare_score`.
    pub date: StoryDate,
    /// URL of the story.
    pub url: StoryUrl,
    /// Title of the story.
    pub title: String,
    /// Tags attached to the story.
    pub tags: TagSet,
    /// Per-scrape data for this story, keyed by scrape id.
    pub scrapes: HashMap<ScrapeId, S>,
}
76
77impl<S> Story<S> {
78 pub fn new_from_parts(
79 title: String,
80 url: StoryUrl,
81 date: StoryDate,
82 score: f32,
83 tags: impl IntoIterator<Item = String>,
84 scrapes: impl IntoIterator<Item = impl Into<(ScrapeId, S)>>,
85 ) -> Self {
86 Self {
87 id: StoryIdentifier::new(date, url.normalization()),
88 tags: TagSet::from_iter(tags),
89 title,
90 url,
91 date,
92 score,
93 scrapes: HashMap::from_iter(scrapes.into_iter().map(|x| x.into())),
94 }
95 }
96
97 pub fn compare_score(&self, other: &Self) -> std::cmp::Ordering {
99 f32::partial_cmp(&self.score, &other.score).unwrap_or_else(|| self.date.cmp(&other.date))
101 }
102
103 pub fn compare_date(&self, other: &Self) -> std::cmp::Ordering {
105 self.date.cmp(&other.date)
106 }
107
108 pub fn is_likely_self_post(&self) -> bool {
111 for scrape_id in self.scrapes.keys() {
112 if scrape_id.is_likely_self_post(&self.url) {
113 return true;
114 }
115 }
116 false
117 }
118
119 pub fn raw_tags(&self) -> Vec<String> {
120 let mut tags = vec![self.url.host().to_owned()];
121 tags.extend(self.tags.dump());
122 tags
123 }
124
125 pub fn render_tags(&self, tagger: &StoryTagger) -> Vec<String> {
126 let mut tags = vec![self.url.host().to_owned()];
127 tags.extend(tagger.make_display_tags(self.tags.dump()));
128 tags
129 }
130
131 pub fn render(&self, eval: &StoryEvaluator, order: usize) -> StoryRender {
132 let mut sources = TypedScrapeMap::new();
133 for (id, _) in &self.scrapes {
134 sources.set(id.source, Some(id.clone()));
135 }
136 StoryRender {
137 order,
138 id: self.id.to_base64(),
139 score: self.score,
140 url: self.url.to_string(),
141 domain: self.url.host().to_owned(),
142 title: self.title.to_owned(),
143 date: self.date,
144 tags: self.render_tags(&eval.tagger),
145 sources,
146 }
147 }
148}
149
/// A set of tag strings.
///
/// Tags inserted through `add` are ASCII-lowercased first; `from_iter` stores its input
/// verbatim.
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct TagSet {
    /// Backing storage; iteration order is unspecified (use `dump` for sorted output).
    set: HashSet<String>,
}
154
155impl TagSet {
156 pub fn new() -> Self {
157 Self {
158 set: HashSet::new(),
159 }
160 }
161
162 pub fn from_iter<S: AsRef<str>>(iter: impl IntoIterator<Item = S>) -> Self {
163 Self {
164 set: HashSet::from_iter(iter.into_iter().map(|s| s.as_ref().to_owned())),
165 }
166 }
167
168 pub fn contains(&self, tag: impl AsRef<str>) -> bool {
169 self.set.contains(tag.as_ref())
170 }
171
172 pub fn add(&mut self, tag: impl AsRef<str>) {
173 self.set.insert(tag.as_ref().to_ascii_lowercase());
174 }
175
176 pub fn collect(&self) -> Vec<String> {
177 self.dump().collect()
178 }
179
180 pub fn dump(&self) -> impl Iterator<Item = String> + '_ {
181 self.set.iter().sorted().cloned()
182 }
183
184 pub fn iter(&self) -> impl Iterator<Item = &String> {
185 self.set.iter()
186 }
187}
188
189impl<'a> IntoIterator for &'a TagSet {
190 type IntoIter = <&'a HashSet<String> as IntoIterator>::IntoIter;
191 type Item = <&'a HashSet<String> as IntoIterator>::Item;
192
193 fn into_iter(self) -> Self::IntoIter {
194 self.set.iter()
195 }
196}
197
198impl IntoIterator for TagSet {
199 type IntoIter = <HashSet<String> as IntoIterator>::IntoIter;
200 type Item = <HashSet<String> as IntoIterator>::Item;
201
202 fn into_iter(self) -> Self::IntoIter {
203 self.set.into_iter()
204 }
205}
206
207impl TagAcceptor for TagSet {
208 fn tag(&mut self, s: &str) {
209 self.add(s);
210 }
211}
212
213pub trait TagAcceptor {
214 fn tag(&mut self, s: &str);
215}