progscrape_scrapers/backends/mod.rs

use serde::{Deserialize, Serialize, ser::SerializeMap};
use std::{borrow::Cow, fmt::Debug};

pub use self::def::ScrapeCore;
pub(crate) use self::def::*;
use crate::types::*;

mod def;
pub mod feed;
pub mod hacker_news;
pub mod legacy;
pub mod lobsters;
pub mod reddit;
pub mod slashdot;
mod utils;

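/// Generates the glue for every scraper backend listed in the invocation at
/// the bottom of this file: the `scrape` entry point, the `ScrapeConfig`
/// struct, the `ScrapeSource` enum, the `TypedScrape` wrapper enum, and the
/// `TypedScrapeMap` per-source container.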
macro_rules! scrapers {
    ($($package:ident :: $name:ident ,)*) => {
        pub mod export {
            $( pub use super::$package; )*
        }

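        /// Run the scraper for the given `source` over the raw `input`,
        /// returning the typed scrapes along with any non-fatal warnings.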
        pub fn scrape(
            config: &ScrapeConfig,
            source: ScrapeSource,
            input: &str,
        ) -> Result<(Vec<TypedScrape>, Vec<String>), ScrapeError> {
            match source {
                $(
                    ScrapeSource::$name => {
                        let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                        let (res, warnings) = scraper.scrape(&config.$package, input)?;
                        Ok((res.into_iter().map(|x| x.into()).collect(), warnings))
                    },
                )*
                ScrapeSource::Other => unreachable!(),
            }
        }

        /// Configuration for all scrapers.
        #[derive(Clone, Default, Serialize, Deserialize)]
        pub struct ScrapeConfig {
            $(
                #[doc="Configuration for the "]
                #[doc=stringify!($name)]
                #[doc=" backend."]
                pub $package: <$package::$name as ScrapeSourceDef>::Config
            ),*
        }

        impl ScrapeConfig {
            pub fn get(&self, source: ScrapeSource) -> Option<&dyn ScrapeConfigSource> {
                match source {
                    $( ScrapeSource::$name => Some(&self.$package), )*
                    ScrapeSource::Other => None,
                }
            }
        }

        #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
        pub enum ScrapeSource {
            $($name,)*
            Other,
        }

        impl ScrapeSource {
            pub fn into_str(&self) -> &'static str {
                match self {
                    $(Self::$name => stringify!($package),)*
                    Self::Other => "other",
                }
            }

            pub fn try_from_str(s: &str) -> Option<Self> {
                match s {
                    $(stringify!($package) => Some(Self::$name),)*
                    "other" => Some(Self::Other),
                    _ => None,
                }
            }

            pub const fn all() -> &'static [ScrapeSource] {
                &[$(Self::$name),*]
            }

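            /// Build the comments URL for a story ID (and optional subsource)
            /// on this source. Panics if called on `Self::Other`.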
            pub fn comments_url(&self, id: &str, subsource: Option<&str>) -> String {
                match self {
                    $(Self::$name => $package::$name::comments_url(id, subsource),)*
                    _ => unimplemented!()
                }
            }

            pub fn id_from_comments_url(&self, url: &str) -> Option<ScrapeId> {
                match self {
                    $(Self::$name => {
                        let (id, subsource) = $package::$name::id_from_comments_url(url)?;
                        Some(ScrapeId::new(*self, subsource.map(|s| s.to_owned()), id.to_owned()))
                    },)*
                    _ => unimplemented!()
                }
            }

            pub fn is_comments_host(&self, host: &str) -> bool {
                match self {
                    $(Self::$name => $package::$name::is_comments_host(host),)*
                    _ => unimplemented!()
                }
            }

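            /// Build a `ScrapeId` for this source from a raw story ID.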
            pub fn id<'a, ID: Clone + Into<Cow<'a, str>>>(&self, id: ID) -> ScrapeId {
                ScrapeId::new(*self, None, id.into().into())
            }

            pub fn subsource_id<'a, ID: Clone + Into<Cow<'a, str>>>(&self, subsource: ID, id: ID) -> ScrapeId {
                ScrapeId::new(*self, Some(subsource.into().into()), id.into().into())
            }
        }

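        /// A scrape from any of the generated backends, tagged with its
        /// source-specific payload type.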
        #[derive(Clone, Debug, Deserialize, Serialize)]
        pub enum TypedScrape {
            $( $name(GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>), )*
        }

        impl TypedScrape {
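            /// Merge another scrape of the same type into this one; scrapes
            /// of differing types are left untouched.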
            pub fn merge(&mut self, b: Self) {
                match (self, b) {
                    $( (Self::$name(a), Self::$name(b)) => a.merge_generic(b), )*
                    (_a, _b) => {
                        // tracing::warn!(
                        //     "Unable to merge incompatible scrapes, ignoring",
                        // );
                    }
                }
            }

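            /// Extract the source-independent `ScrapeCore` from this scrape,
            /// using the matching per-source configuration.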
            pub(crate) fn extract(&self, config: &ScrapeConfig) -> ScrapeCore {
                match self {
                    $(
                        Self::$name(a) => {
                            let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                            scraper.extract_core(&config.$package, a)
                        }
                    )*
                }
            }

            $(
            /// Attempt to coerce this `TypedScrape` into a `GenericScrape` of the given type.
            pub fn $package(&self) -> Option<&GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> {
                match self {
                    Self::$name(a) => Some(a),
                    _ => None,
                }
            }
            )*
        }

        impl std::ops::Deref for TypedScrape {
            type Target = ScrapeShared;
            fn deref(&self) -> &Self::Target {
                match self {
                    $( Self::$name(a) => &a.shared, )*
                }
            }
        }

        impl std::ops::DerefMut for TypedScrape {
            fn deref_mut(&mut self) -> &mut Self::Target {
                match self {
                    $( Self::$name(a) => &mut a.shared, )*
                }
            }
        }

        $(
            impl From<GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> for TypedScrape {
                fn from(x: GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>) -> Self {
                    TypedScrape::$name(x)
                }
            }
        )*

        /// A strongly-typed scrape map that can be used to collect values by scrape source without allocations.
        #[derive(Debug, Eq, PartialEq)]
        pub struct TypedScrapeMap<V> {
            $( pub $package: V, )*
            pub other: V,
        }

        impl<V: Default> TypedScrapeMap<V> {
            pub fn new() -> Self {
                Self {
                    $( $package: Default::default(), )*
                    other: Default::default(),
                }
            }
        }

        impl<V: Copy> TypedScrapeMap<V> {
            pub fn new_with_all(v: V) -> Self {
                Self {
                    $( $package: v, )*
                    other: v,
                }
            }
        }

        impl<V: Default> Default for TypedScrapeMap<V> {
            fn default() -> Self {
                Self::new()
            }
        }

        impl<V: Clone> Clone for TypedScrapeMap<V> {
            fn clone(&self) -> Self {
                Self {
                    $( $package: self.$package.clone(), )*
                    other: self.other.clone(),
                }
            }
        }

        impl<V> TypedScrapeMap<V> {
            /// Get the given value based on a dynamic source.
            pub fn get(&self, source: ScrapeSource) -> &V {
                match source {
                    $( ScrapeSource::$name => &self.$package, )*
                    ScrapeSource::Other => &self.other,
                }
            }

            /// Set the given value based on a dynamic source.
            pub fn set(&mut self, source: ScrapeSource, mut value: V) -> V {
                match source {
                    $( ScrapeSource::$name => std::mem::swap(&mut value, &mut self.$package), )*
                    ScrapeSource::Other => std::mem::swap(&mut value, &mut self.other),
                }
                value
            }

            /// Remove the given value based on a dynamic source, if values have
            /// a default.
            pub fn remove(&mut self, source: ScrapeSource) -> V where V: Default {
                self.set(source, V::default())
            }

            /// Iterate over the underlying values.
            pub fn values(&self) -> impl Iterator<Item = &'_ V> {
                [$( &self.$package, )* &self.other].into_iter()
            }

            /// Iterate over the underlying keys/values.
            pub fn iter(&self) -> impl Iterator<Item = (ScrapeSource, &'_ V)> {
                [$( (ScrapeSource::$name, &self.$package), )* (ScrapeSource::Other, &self.other)].into_iter()
            }

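            /// Consume the map, applying `f` to each (source, value) pair to
            /// produce a map with a new value type.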
            pub fn into_with_map<T>(self, f: impl Fn(ScrapeSource, V) -> T) -> TypedScrapeMap<T> {
                TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package), )*
                    other: f(ScrapeSource::Other, self.other),
                }
            }

            pub fn into_with_map_fallible<T, E>(self, f: impl Fn(ScrapeSource, V) -> Result<T, E>) -> Result<TypedScrapeMap<T>, E> {
                Ok(TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package)?, )*
                    other: f(ScrapeSource::Other, self.other)?,
                })
            }
        }

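        /// Always returns 1; used below to count macro repetitions so the
        /// `IntoIterator` array length (1 plus one per source) is a constant.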
        const fn one(_: &'static str) -> usize {
            1
        }

        impl<V> IntoIterator for TypedScrapeMap<V> {
            type Item = V;
            type IntoIter = <[V; 1 $( + one(stringify!($package)) )*] as IntoIterator>::IntoIter;

            fn into_iter(self) -> Self::IntoIter {
                [$(self.$package,)* self.other].into_iter()
            }
        }

        impl<V: Serialize> Serialize for TypedScrapeMap<V> {
            fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
            where
                S: serde::Serializer,
            {
                let mut map = serializer.serialize_map(None)?;
                $(
                    map.serialize_entry(stringify!($package), &self.$package)?;
                )*
                map.serialize_entry("other", &self.other)?;
                map.end()
            }
        }

        /// Implement `Deserialize` if and only if `V` is `Default` as well.
        impl<'de, V: Default + Deserialize<'de>> Deserialize<'de> for TypedScrapeMap<V> {
            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
            where
                D: serde::Deserializer<'de>,
            {
                #[derive(Deserialize)]
                struct Temp<V> {
                    $( #[serde(default)] $package: V, )*
                    #[serde(default)] other: V,
                }

                let temp = Temp::deserialize(deserializer)?;
                Ok(TypedScrapeMap::<V> {
                    $( $package: temp.$package, )*
                    other: temp.other,
                })
            }
        }
    };
}

impl From<TypedScrape> for (ScrapeId, TypedScrape) {
    fn from(val: TypedScrape) -> Self {
        (val.id.clone(), val)
    }
}

impl Serialize for ScrapeSource {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        self.into_str().serialize(serializer)
    }
}

impl<'de> Deserialize<'de> for ScrapeSource {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let s = String::deserialize(deserializer)?;
        if let Some(source) = ScrapeSource::try_from_str(&s) {
            Ok(source)
        } else {
            Err(serde::de::Error::custom("Invalid source"))
        }
    }
}

scrapers! {
    hacker_news::HackerNews,
    slashdot::Slashdot,
    lobsters::Lobsters,
    reddit::Reddit,
    feed::Feed,
}
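
// Note: the generated `ScrapeConfig` and `TypedScrapeMap` fields are keyed by
// module name: `hacker_news`, `slashdot`, `lobsters`, `reddit`, and `feed`,
// plus the catch-all `other` slot on the map.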

#[cfg(any(test, feature = "scrape_test"))]
pub mod test {
    use super::*;

    macro_rules! stringify_all {
        ( $($s:literal),* ) => {
            vec![ $( include_str!( concat!("../../testdata/", $s ) ) ),* ]
        };
    }

    fn slashdot_files() -> Vec<&'static str> {
        stringify_all!["slashdot1.html", "slashdot2.html", "slashdot3.html"]
    }

    fn hacker_news_files() -> Vec<&'static str> {
        stringify_all![
            "hn1.html", "hn2.html", "hn3.html", "hn4.html", "hn5.html", "hn6.html"
        ]
    }

    fn lobsters_files() -> Vec<&'static str> {
        stringify_all!["lobsters1.rss", "lobsters2.rss"]
    }

    fn reddit_files() -> Vec<&'static str> {
        stringify_all![
            "reddit-prog-tag1.json",
            "reddit-prog-tag2.json",
            "reddit-prog1.json",
            "reddit-science1.json",
            "reddit-science2.json"
        ]
    }

    pub fn files_by_source(source: ScrapeSource) -> Vec<&'static str> {
        match source {
            ScrapeSource::HackerNews => hacker_news_files(),
            ScrapeSource::Slashdot => slashdot_files(),
            ScrapeSource::Reddit => reddit_files(),
            ScrapeSource::Lobsters => lobsters_files(),
            ScrapeSource::Feed => vec![],
            ScrapeSource::Other => vec![],
        }
    }

    /// Loads the various sample stories we've collected.
    pub fn load_sample_scrapes(config: &ScrapeConfig) -> Vec<TypedScrape> {
        let mut v = vec![];
        for source in [
            ScrapeSource::HackerNews,
            ScrapeSource::Lobsters,
            ScrapeSource::Reddit,
            ScrapeSource::Slashdot,
        ] {
            for file in files_by_source(source) {
                let mut res = scrape(config, source, file)
                    .unwrap_or_else(|_| panic!("Scrape of {source:?} failed"));
                if res.0.is_empty() {
                    panic!("Failed to scrape anything! {file} {:?}", res.1);
                }
                v.append(&mut res.0);
            }
            v.sort_by_key(|scrape| scrape.date);
        }
        v
    }

    #[test]
    fn test_scrape_all() {
        use crate::ScrapeExtractor;

        let config = ScrapeConfig::default();
        let extractor = ScrapeExtractor::new(&config);
        for scrape in load_sample_scrapes(&config) {
            let scrape = extractor.extract(&scrape);
            // Sanity check the scrapes
            assert!(
                !scrape.title.contains("&amp")
                    && !scrape.title.contains("&quot")
                    && !scrape.title.contains("&squot")
            );
            assert!(!scrape.url.raw().contains("&amp"));
            assert!(scrape.date.year() >= 2022 && scrape.date.year() <= 2024);
        }
    }
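
    // Minimal added sanity checks sketching how the generated API fits
    // together; these use only items defined in this module.
    #[test]
    fn test_source_str_roundtrip() {
        for source in ScrapeSource::all() {
            assert_eq!(
                ScrapeSource::try_from_str(source.into_str()),
                Some(*source)
            );
        }
        assert_eq!(
            ScrapeSource::try_from_str("other"),
            Some(ScrapeSource::Other)
        );
        assert_eq!(ScrapeSource::try_from_str("nonexistent"), None);
    }

    #[test]
    fn test_typed_scrape_map_accessors() {
        let mut map = TypedScrapeMap::<usize>::new();
        assert_eq!(*map.get(ScrapeSource::Reddit), 0);
        // `set` swaps in the new value and returns the old one.
        assert_eq!(map.set(ScrapeSource::Reddit, 5), 0);
        assert_eq!(*map.get(ScrapeSource::Reddit), 5);
        // One slot per generated source, plus the `other` slot.
        assert_eq!(map.values().count(), ScrapeSource::all().len() + 1);
    }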
}