progscrape_scrapers/backends/mod.rs

use serde::{ser::SerializeMap, Deserialize, Serialize};
use std::{borrow::Cow, fmt::Debug};

pub use self::def::ScrapeCore;
pub(crate) use self::def::*;
use crate::types::*;

mod def;
pub mod hacker_news;
pub mod legacy;
pub mod lobsters;
pub mod reddit;
pub mod slashdot;
mod utils;

/// Generates the dispatch plumbing shared by every scraper backend:
/// the `scrape` entry point, `ScrapeConfig`, `ScrapeSource`, `TypedScrape`,
/// and `TypedScrapeMap`.
macro_rules! scrapers {
    ($($package:ident :: $name:ident ,)*) => {
        pub mod export {
            $( pub use super::$package; )*
        }

        pub fn scrape(
            config: &ScrapeConfig,
            source: ScrapeSource,
            input: &str,
        ) -> Result<(Vec<TypedScrape>, Vec<String>), ScrapeError> {
            match source {
                $(
                    ScrapeSource::$name => {
                        let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                        let (res, warnings) = scraper.scrape(&config.$package, input)?;
                        Ok((res.into_iter().map(|x| x.into()).collect(), warnings))
                    },
                )*
                ScrapeSource::Other => unreachable!(),
            }
        }

        /// Configuration for all scrapers, one field per backend.
        #[derive(Clone, Default, Serialize, Deserialize)]
        pub struct ScrapeConfig {
            $(
                #[doc="Configuration for the "]
                #[doc=stringify!($name)]
                #[doc=" backend."]
                pub $package: <$package::$name as ScrapeSourceDef>::Config
            ),*
        }

        impl ScrapeConfig {
            pub fn get(&self, source: ScrapeSource) -> Option<&dyn ScrapeConfigSource> {
                match source {
                    $( ScrapeSource::$name => Some(&self.$package), )*
                    ScrapeSource::Other => None,
                }
            }
        }

        #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
        pub enum ScrapeSource {
            $($name,)*
            Other,
        }

        impl ScrapeSource {
            pub fn into_str(&self) -> &'static str {
                match self {
                    $(Self::$name => stringify!($package),)*
                    Self::Other => "other",
                }
            }

            pub fn try_from_str(s: &str) -> Option<Self> {
                match s {
                    $(stringify!($package) => Some(Self::$name),)*
                    "other" => Some(Self::Other),
                    _ => None,
                }
            }

            pub const fn all() -> &'static [ScrapeSource] {
                &[$(Self::$name),*]
            }

            pub fn comments_url(&self, id: &str, subsource: Option<&str>) -> String {
                match self {
                    $(Self::$name => $package::$name::comments_url(id, subsource),)*
                    _ => unimplemented!(),
                }
            }

            pub fn id_from_comments_url(&self, url: &str) -> Option<ScrapeId> {
                match self {
                    $(Self::$name => {
                        let (source, subsource) = $package::$name::id_from_comments_url(url)?;
                        Some(ScrapeId::new(*self, subsource.map(|s| s.to_owned()), source.to_owned()))
                    },)*
                    _ => unimplemented!(),
                }
            }

            pub fn is_comments_host(&self, host: &str) -> bool {
                match self {
                    $(Self::$name => $package::$name::is_comments_host(host),)*
                    _ => unimplemented!(),
                }
            }

            pub fn id<'a, ID: Clone + Into<Cow<'a, str>>>(&self, id: ID) -> ScrapeId {
                ScrapeId::new(*self, None, id.into().into())
            }

            pub fn subsource_id<'a, ID: Clone + Into<Cow<'a, str>>>(
                &self,
                subsource: ID,
                id: ID,
            ) -> ScrapeId {
                ScrapeId::new(*self, Some(subsource.into().into()), id.into().into())
            }
        }

        #[derive(Clone, Debug, Deserialize, Serialize)]
        pub enum TypedScrape {
            $( $name(GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>), )*
        }

        impl TypedScrape {
            pub fn merge(&mut self, b: Self) {
                match (self, b) {
                    $( (Self::$name(a), Self::$name(b)) => a.merge_generic(b), )*
                    (_a, _b) => {
                        // Scrapes from different sources cannot be merged;
                        // the incoming scrape is dropped.
                    }
                }
            }

            pub(crate) fn extract(&self, config: &ScrapeConfig) -> ScrapeCore {
                match self {
                    $(
                        Self::$name(a) => {
                            let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                            scraper.extract_core(&config.$package, a)
                        }
                    )*
                }
            }

            $(
                /// Returns the underlying scrape if this value came from the
                /// corresponding backend.
                pub fn $package(&self) -> Option<&GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> {
                    match self {
                        Self::$name(a) => Some(a),
                        _ => None,
                    }
                }
            )*
        }

        impl std::ops::Deref for TypedScrape {
            type Target = ScrapeShared;
            fn deref(&self) -> &Self::Target {
                match self {
                    $( Self::$name(a) => &a.shared, )*
                }
            }
        }

        impl std::ops::DerefMut for TypedScrape {
            fn deref_mut(&mut self) -> &mut Self::Target {
                match self {
                    $( Self::$name(a) => &mut a.shared, )*
                }
            }
        }

        $(
            impl From<GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> for TypedScrape {
                fn from(x: GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>) -> Self {
                    TypedScrape::$name(x)
                }
            }
        )*

        /// A fixed map with one slot per scrape source, plus a catch-all
        /// `other` slot.
        #[derive(Debug, Eq, PartialEq)]
        pub struct TypedScrapeMap<V> {
            $( pub $package: V, )*
            pub other: V,
        }

        impl<V: Default> TypedScrapeMap<V> {
            pub fn new() -> Self {
                Self {
                    $( $package: Default::default(), )*
                    other: Default::default(),
                }
            }
        }

        impl<V: Copy> TypedScrapeMap<V> {
            pub fn new_with_all(v: V) -> Self {
                Self {
                    $( $package: v, )*
                    other: v,
                }
            }
        }

        impl<V: Default> Default for TypedScrapeMap<V> {
            fn default() -> Self {
                Self::new()
            }
        }

        impl<V: Clone> Clone for TypedScrapeMap<V> {
            fn clone(&self) -> Self {
                Self {
                    $( $package: self.$package.clone(), )*
                    other: self.other.clone(),
                }
            }
        }

        impl<V> TypedScrapeMap<V> {
            /// Returns a reference to the value for the given source.
            pub fn get(&self, source: ScrapeSource) -> &V {
                match source {
                    $( ScrapeSource::$name => &self.$package, )*
                    ScrapeSource::Other => &self.other,
                }
            }

            /// Stores a value for the given source, returning the previous one.
            pub fn set(&mut self, source: ScrapeSource, mut value: V) -> V {
                match source {
                    $( ScrapeSource::$name => std::mem::swap(&mut value, &mut self.$package), )*
                    ScrapeSource::Other => std::mem::swap(&mut value, &mut self.other),
                }
                value
            }

            /// Resets the value for the given source to its default,
            /// returning the previous one.
            pub fn remove(&mut self, source: ScrapeSource) -> V
            where
                V: Default,
            {
                self.set(source, V::default())
            }

            /// Iterates over every value, including `other`.
            pub fn values(&self) -> impl Iterator<Item = &'_ V> {
                [$( &self.$package, )* &self.other].into_iter()
            }

            /// Iterates over `(source, value)` pairs, including `Other`.
            pub fn iter(&self) -> impl Iterator<Item = (ScrapeSource, &'_ V)> {
                [$( (ScrapeSource::$name, &self.$package), )* (ScrapeSource::Other, &self.other)].into_iter()
            }

            /// Consumes the map, applying `f` to each value.
            pub fn into_with_map<T>(self, f: impl Fn(ScrapeSource, V) -> T) -> TypedScrapeMap<T> {
                TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package), )*
                    other: f(ScrapeSource::Other, self.other),
                }
            }

            /// Fallible version of `into_with_map`; the first error aborts.
            pub fn into_with_map_fallible<T, E>(
                self,
                f: impl Fn(ScrapeSource, V) -> Result<T, E>,
            ) -> Result<TypedScrapeMap<T>, E> {
                Ok(TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package)?, )*
                    other: f(ScrapeSource::Other, self.other)?,
                })
            }
        }

        /// Maps any stringified package name to 1, so the repetition in the
        /// array length below counts the sources (plus one for `other`) at
        /// compile time.
        const fn one(_: &'static str) -> usize {
            1
        }

        impl<V> IntoIterator for TypedScrapeMap<V> {
            type Item = V;
            type IntoIter = <[V; 1 $( + one(stringify!($package)) )* ] as IntoIterator>::IntoIter;

            fn into_iter(self) -> Self::IntoIter {
                [$( self.$package, )* self.other].into_iter()
            }
        }

        impl<V: Serialize> Serialize for TypedScrapeMap<V> {
            fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
            where
                S: serde::Serializer,
            {
                let mut map = serializer.serialize_map(None)?;
                $(
                    map.serialize_entry(stringify!($package), &self.$package)?;
                )*
                map.serialize_entry("other", &self.other)?;
                map.end()
            }
        }

        impl<'de, V: Default + Deserialize<'de>> Deserialize<'de> for TypedScrapeMap<V> {
            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
            where
                D: serde::Deserializer<'de>,
            {
                // Deserialize through a mirror struct so that any missing
                // field falls back to its default value.
                #[derive(Deserialize)]
                struct Temp<V> {
                    $( #[serde(default)] $package: V, )*
                    #[serde(default)] other: V,
                }

                let temp = Temp::deserialize(deserializer)?;
                Ok(TypedScrapeMap::<V> {
                    $( $package: temp.$package, )*
                    other: temp.other,
                })
            }
        }
    };
}

impl From<TypedScrape> for (ScrapeId, TypedScrape) {
    fn from(val: TypedScrape) -> Self {
        (val.id.clone(), val)
    }
}

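// Illustrative sketch (not part of the original source): the conversion above
// lets a batch of scrapes be keyed by `ScrapeId`, e.g. collected into a map
// (assuming `ScrapeId` implements `Hash + Eq`):
//
//     let map: std::collections::HashMap<ScrapeId, TypedScrape> =
//         scrapes.into_iter().map(Into::into).collect();
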
impl Serialize for ScrapeSource {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        self.into_str().serialize(serializer)
    }
}

impl<'de> Deserialize<'de> for ScrapeSource {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let s = String::deserialize(deserializer)?;
        if let Some(source) = ScrapeSource::try_from_str(&s) {
            Ok(source)
        } else {
            Err(serde::de::Error::custom("Invalid source"))
        }
    }
}

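// Illustrative sketch (assumes `serde_json` is available as a dev-dependency,
// which is not shown in this module): `ScrapeSource` serializes as the bare
// string produced by `into_str`, and round-trips via `try_from_str`.
#[cfg(test)]
mod serde_repr_sketch {
    use super::*;

    #[test]
    fn scrape_source_round_trips_as_string() {
        let json = serde_json::to_string(&ScrapeSource::Reddit).expect("serialize");
        assert_eq!(json, "\"reddit\"");
        let back: ScrapeSource = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(back, ScrapeSource::Reddit);
    }
}
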
scrapers! {
    hacker_news::HackerNews,
    slashdot::Slashdot,
    lobsters::Lobsters,
    reddit::Reddit,
}

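// Illustrative sketch (not part of the original source): a quick tour of the
// API generated by the `scrapers!` invocation above. `into_str` stringifies
// the module path, so `ScrapeSource::HackerNews` maps to "hacker_news".
#[cfg(test)]
mod generated_api_sketch {
    use super::*;

    #[test]
    fn source_name_round_trip() {
        assert_eq!(ScrapeSource::HackerNews.into_str(), "hacker_news");
        assert_eq!(
            ScrapeSource::try_from_str("hacker_news"),
            Some(ScrapeSource::HackerNews)
        );
        assert_eq!(ScrapeSource::try_from_str("unknown"), None);
    }

    #[test]
    fn typed_scrape_map_accessors() {
        let mut map: TypedScrapeMap<usize> = TypedScrapeMap::new();
        // `set` swaps in the new value and returns the previous one.
        assert_eq!(map.set(ScrapeSource::Reddit, 3), 0);
        assert_eq!(*map.get(ScrapeSource::Reddit), 3);
        // One slot per declared source, plus `other`.
        assert_eq!(map.values().count(), ScrapeSource::all().len() + 1);
    }
}
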
#[cfg(any(test, feature = "scrape_test"))]
pub mod test {
    use super::*;

    macro_rules! stringify_all {
        ( $($s:literal),* ) => {
            vec![ $( include_str!( concat!("../../testdata/", $s) ) ),* ]
        };
    }

    fn slashdot_files() -> Vec<&'static str> {
        stringify_all!["slashdot1.html", "slashdot2.html", "slashdot3.html"]
    }

    fn hacker_news_files() -> Vec<&'static str> {
        stringify_all!["hn1.html", "hn2.html", "hn3.html", "hn4.html"]
    }

    fn lobsters_files() -> Vec<&'static str> {
        stringify_all!["lobsters1.rss", "lobsters2.rss"]
    }

    fn reddit_files() -> Vec<&'static str> {
        stringify_all![
            "reddit-prog-tag1.json",
            "reddit-prog-tag2.json",
            "reddit-prog1.json",
            "reddit-science1.json",
            "reddit-science2.json"
        ]
    }

    pub fn files_by_source(source: ScrapeSource) -> Vec<&'static str> {
        match source {
            ScrapeSource::HackerNews => hacker_news_files(),
            ScrapeSource::Slashdot => slashdot_files(),
            ScrapeSource::Reddit => reddit_files(),
            ScrapeSource::Lobsters => lobsters_files(),
            ScrapeSource::Other => vec![],
        }
    }

    pub fn load_sample_scrapes(config: &ScrapeConfig) -> Vec<TypedScrape> {
        let mut v = vec![];
        for source in [
            ScrapeSource::HackerNews,
            ScrapeSource::Lobsters,
            ScrapeSource::Reddit,
            ScrapeSource::Slashdot,
        ] {
            for file in files_by_source(source) {
                let mut res = scrape(config, source, file)
                    .unwrap_or_else(|_| panic!("Scrape of {:?} failed", source));
                v.append(&mut res.0);
            }
            v.sort_by_key(|scrape| scrape.date);
        }
        v
    }

    #[test]
    fn test_scrape_all() {
        use crate::ScrapeExtractor;

        let config = ScrapeConfig::default();
        let extractor = ScrapeExtractor::new(&config);
        for scrape in load_sample_scrapes(&config) {
            let scrape = extractor.extract(&scrape);
            // HTML entities such as `&amp` and `&quot` should already have
            // been decoded by the individual scrapers.
            assert!(
                !scrape.title.contains("&amp")
                    && !scrape.title.contains("&quot")
                    && !scrape.title.contains("&squot")
            );
            assert!(!scrape.url.raw().contains("&amp"));
            assert!(scrape.date.year() == 2023 || scrape.date.year() == 2022);
        }
    }
}