progscrape_scrapers/backends/mod.rs

use serde::{ser::SerializeMap, Deserialize, Serialize};
use std::{borrow::Cow, fmt::Debug};

pub use self::def::ScrapeCore;
pub(crate) use self::def::*;
use crate::types::*;

mod def;
pub mod hacker_news;
pub mod legacy;
pub mod lobsters;
pub mod reddit;
pub mod slashdot;
mod utils;

macro_rules! scrapers {
    ($($package:ident :: $name:ident ,)*) => {
        pub mod export {
            $( pub use super::$package; )*
        }

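        /// Run the scraper for `source` against raw `input` (HTML, RSS, or
        /// JSON, depending on the backend), returning the parsed scrapes and
        /// any non-fatal warnings.
        ///
        /// A minimal sketch of a call site (the `html` input variable here is
        /// hypothetical):
        ///
        /// ```ignore
        /// let config = ScrapeConfig::default();
        /// let (scrapes, warnings) = scrape(&config, ScrapeSource::HackerNews, html)?;
        /// ```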
        pub fn scrape(
            config: &ScrapeConfig,
            source: ScrapeSource,
            input: &str,
        ) -> Result<(Vec<TypedScrape>, Vec<String>), ScrapeError> {
            match source {
                $(
                    ScrapeSource::$name => {
                        let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                        let (res, warnings) = scraper.scrape(&config.$package, input)?;
                        Ok((res.into_iter().map(|x| x.into()).collect(), warnings))
                    },
                )*
                ScrapeSource::Other => unreachable!(),
            }
        }

        /// Configuration for all scrapers.
        #[derive(Clone, Default, Serialize, Deserialize)]
        pub struct ScrapeConfig {
            $(
                #[doc="Configuration for the "]
                #[doc=stringify!($name)]
                #[doc=" backend."]
                pub $package: <$package::$name as ScrapeSourceDef>::Config
            ),*
        }

        impl ScrapeConfig {
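            /// Look up the configuration for a source as a dynamic
            /// `ScrapeConfigSource` trait object; returns `None` for
            /// `ScrapeSource::Other`, which has no backing scraper.
            ///
            /// ```ignore
            /// let config = ScrapeConfig::default();
            /// let hn_config = config.get(ScrapeSource::HackerNews).expect("a real source");
            /// ```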
            pub fn get(&self, source: ScrapeSource) -> Option<&dyn ScrapeConfigSource> {
                match source {
                    $( ScrapeSource::$name => Some(&self.$package), )*
                    ScrapeSource::Other => None,
                }
            }
        }

        #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
        pub enum ScrapeSource {
            $($name,)*
            Other,
        }

        impl ScrapeSource {
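            /// Returns the stable string identifier for this source (its module
            /// name, e.g. `"hacker_news"`). This round-trips with
            /// [`Self::try_from_str`]:
            ///
            /// ```ignore
            /// let s = ScrapeSource::HackerNews.into_str();
            /// assert_eq!(ScrapeSource::try_from_str(s), Some(ScrapeSource::HackerNews));
            /// ```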
            pub fn into_str(&self) -> &'static str {
                match self {
                    $(Self::$name => stringify!($package),)*
                    Self::Other => "other",
                }
            }

            pub fn try_from_str(s: &str) -> Option<Self> {
                match s {
                    $(stringify!($package) => Some(Self::$name),)*
                    "other" => Some(Self::Other),
                    _ => None,
                }
            }

            pub const fn all() -> &'static [ScrapeSource] {
                &[$(Self::$name),*]
            }

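            /// Builds the comments-page URL for a story on this source. The URL
            /// format is defined by each backend; a sketch of a call (the id is
            /// hypothetical, and the resulting URL shape is backend-specific):
            ///
            /// ```ignore
            /// let url = ScrapeSource::HackerNews.comments_url("36000000", None);
            /// ```
            ///
            /// Panics with `unimplemented!()` for `ScrapeSource::Other`.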
            pub fn comments_url(&self, id: &str, subsource: Option<&str>) -> String {
                match self {
                    $(Self::$name => $package::$name::comments_url(id, subsource),)*
                    _ => unimplemented!()
                }
            }

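            /// Inverse of [`Self::comments_url`]: parses a comments-page URL back
            /// into a [`ScrapeId`], or `None` if the URL does not match this
            /// source's comment-page format.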
            pub fn id_from_comments_url(&self, url: &str) -> Option<ScrapeId> {
                match self {
                    $(Self::$name => {
                        let (source, subsource) = $package::$name::id_from_comments_url(url)?;
                        Some(ScrapeId::new(*self, subsource.map(|s| s.to_owned()), source.to_owned()))
                    },)*
                    _ => unimplemented!()
                }
            }

            pub fn is_comments_host(&self, host: &str) -> bool {
                match self {
                    $(Self::$name => $package::$name::is_comments_host(host),)*
                    _ => unimplemented!()
                }
            }

            pub fn id<'a, ID: Clone + Into<Cow<'a, str>>>(&self, id: ID) -> ScrapeId {
                ScrapeId::new(*self, None, id.into().into())
            }

            pub fn subsource_id<'a, ID: Clone + Into<Cow<'a, str>>>(&self, subsource: ID, id: ID) -> ScrapeId {
                ScrapeId::new(*self, Some(subsource.into().into()), id.into().into())
            }
        }

        #[derive(Clone, Debug, Deserialize, Serialize)]
        pub enum TypedScrape {
            $( $name (GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>), )*
        }

        impl TypedScrape {
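            /// Merge scrape `b` into `self`. Both scrapes must be the same
            /// variant; a mismatched pair is ignored (the disabled
            /// `tracing::warn!` below records the intent to log that case).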
            pub fn merge(&mut self, b: Self) {
                match (self, b) {
                    $( (Self::$name(a), Self::$name(b)) => a.merge_generic(b), )*
                    (_a, _b) => {
                        // tracing::warn!(
                        //     "Unable to merge incompatible scrapes, ignoring",
                        // );
                    }
                }
            }

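            /// Extract the normalized [`ScrapeCore`] (title, URL, date, and so
            /// on) from this scrape using the per-source scraper and its
            /// configuration.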
            pub(crate) fn extract(&self, config: &ScrapeConfig) -> ScrapeCore {
                match self {
                    $(
                        Self::$name(a) => {
                            let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                            scraper.extract_core(&config.$package, a)
                        }
                    )*
                }
            }

            $(
            /// Attempt to coerce this `TypedScrape` into a `GenericScrape` of the given type.
            pub fn $package(&self) -> Option<&GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> {
                match self {
                    Self::$name(a) => Some(a),
                    _ => None,
                }
            }
            )*
        }

        impl std::ops::Deref for TypedScrape {
            type Target = ScrapeShared;
            fn deref(&self) -> &Self::Target {
                match self {
                    $( Self::$name(a) => &a.shared, )*
                }
            }
        }

        impl std::ops::DerefMut for TypedScrape {
            fn deref_mut(&mut self) -> &mut Self::Target {
                match self {
                    $( Self::$name(a) => &mut a.shared, )*
                }
            }
        }

        $(
            impl From<GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> for TypedScrape {
                fn from(x: GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>) -> Self {
                    TypedScrape::$name(x)
                }
            }
        )*

        /// A strongly-typed scrape map that can be used to collect values by scrape source without allocations.
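        ///
        /// A minimal usage sketch, counting scrapes per source (`scrapes` is a
        /// hypothetical iterator of `TypedScrape`, and `ScrapeId` is assumed to
        /// expose its `source` field):
        ///
        /// ```ignore
        /// let mut counts = TypedScrapeMap::<usize>::new();
        /// for scrape in scrapes {
        ///     let source = scrape.id.source;
        ///     let n = *counts.get(source);
        ///     counts.set(source, n + 1);
        /// }
        /// ```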
        #[derive(Debug, Eq, PartialEq)]
        pub struct TypedScrapeMap<V> {
            $( pub $package: V, )*
            pub other: V,
        }

        impl<V: Default> TypedScrapeMap<V> {
            pub fn new() -> Self {
                Self {
                    $( $package: Default::default(), )*
                    other: Default::default(),
                }
            }
        }

        impl<V: Copy> TypedScrapeMap<V> {
            pub fn new_with_all(v: V) -> Self {
                Self {
                    $( $package: v, )*
                    other: v,
                }
            }
        }

        impl<V: Default> Default for TypedScrapeMap<V> {
            fn default() -> Self {
                Self::new()
            }
        }

        impl<V: Clone> Clone for TypedScrapeMap<V> {
            fn clone(&self) -> Self {
                Self {
                    $( $package: self.$package.clone(), )*
                    other: self.other.clone(),
                }
            }
        }

        impl<V> TypedScrapeMap<V> {
            /// Get the given value based on a dynamic source.
            pub fn get(&self, source: ScrapeSource) -> &V {
                match source {
                    $( ScrapeSource::$name => &self.$package, )*
                    ScrapeSource::Other => &self.other,
                }
            }

            /// Set the given value based on a dynamic source.
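            /// Returns the value previously stored for that source:
            ///
            /// ```ignore
            /// let mut map = TypedScrapeMap::<u32>::new();
            /// assert_eq!(map.set(ScrapeSource::Reddit, 7), 0); // replaced the default
            /// assert_eq!(map.set(ScrapeSource::Reddit, 9), 7);
            /// ```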
            pub fn set(&mut self, source: ScrapeSource, mut value: V) -> V {
                match source {
                    $( ScrapeSource::$name => std::mem::swap(&mut value, &mut self.$package), )*
                    ScrapeSource::Other => std::mem::swap(&mut value, &mut self.other),
                }
                value
            }

            /// Reset the value for the given source to its default, returning
            /// the previous value.
            pub fn remove(&mut self, source: ScrapeSource) -> V where V: Default {
                self.set(source, V::default())
            }

            /// Iterate over the underlying values.
            pub fn values(&self) -> impl Iterator<Item = &'_ V> {
                [$( &self.$package, )* &self.other].into_iter()
            }

            /// Iterate over the underlying keys/values.
            pub fn iter(&self) -> impl Iterator<Item = (ScrapeSource, &'_ V)> {
                [$( (ScrapeSource::$name, &self.$package), )* (ScrapeSource::Other, &self.other)].into_iter()
            }

            /// Consume the map, producing a new map with each value transformed
            /// by `f`.
            pub fn into_with_map<T>(self, f: impl Fn(ScrapeSource, V) -> T) -> TypedScrapeMap<T> {
                TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package), )*
                    other: f(ScrapeSource::Other, self.other),
                }
            }

            /// Fallible version of [`Self::into_with_map`]: the first error
            /// returned by `f` aborts the transformation.
            pub fn into_with_map_fallible<T, E>(self, f: impl Fn(ScrapeSource, V) -> Result<T, E>) -> Result<TypedScrapeMap<T>, E> {
                Ok(TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package)?, )*
                    other: f(ScrapeSource::Other, self.other)?,
                })
            }
        }

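        // Maps one macro repetition to the constant 1; used below to compute
        // the fixed array length for `IntoIterator` (one slot per backend,
        // plus one for `other`).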
        const fn one(_: &'static str) -> usize {
            1
        }

        impl<V> IntoIterator for TypedScrapeMap<V> {
            type Item = V;
            type IntoIter = <[V; 1 $( + one(stringify!($package)) )* ] as IntoIterator>::IntoIter;

            fn into_iter(self) -> Self::IntoIter {
                [$(self.$package,)* self.other].into_iter()
            }
        }

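        // `TypedScrapeMap` serializes as a flat map keyed by source name plus
        // "other". The shape, assuming `V = usize` and the backends declared at
        // the bottom of this file, would be:
        //   {"hacker_news":0,"slashdot":0,"lobsters":0,"reddit":0,"other":0}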
        impl<V: Serialize> Serialize for TypedScrapeMap<V> {
            fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
            where
                S: serde::Serializer,
            {
                let mut map = serializer.serialize_map(None)?;
                $(
                    map.serialize_entry(stringify!($package), &self.$package)?;
                )*
                map.serialize_entry("other", &self.other)?;
                map.end()
            }
        }

        /// Implement `Deserialize` only when `V: Default`, so that keys missing
        /// from the input fall back to `V::default()`.
        impl<'de, V: Default + Deserialize<'de>> Deserialize<'de> for TypedScrapeMap<V> {
            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
            where
                D: serde::Deserializer<'de>,
            {
                #[derive(Deserialize)]
                struct Temp<V> {
                    $( #[serde(default)] $package: V, )*
                    #[serde(default)] other: V,
                }

                let temp = Temp::deserialize(deserializer)?;
                Ok(TypedScrapeMap::<V> {
                    $( $package: temp.$package, )*
                    other: temp.other,
                })
            }
        }

    };
}

impl From<TypedScrape> for (ScrapeId, TypedScrape) {
    fn from(val: TypedScrape) -> Self {
        (val.id.clone(), val)
    }
}

impl Serialize for ScrapeSource {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        self.into_str().serialize(serializer)
    }
}

impl<'de> Deserialize<'de> for ScrapeSource {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let s = String::deserialize(deserializer)?;
        if let Some(source) = ScrapeSource::try_from_str(&s) {
            Ok(source)
        } else {
            Err(serde::de::Error::custom("Invalid source"))
        }
    }
}

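// Expanding `scrapers!` generates `ScrapeSource`, `ScrapeConfig`, `TypedScrape`,
// and `TypedScrapeMap` for the four supported backends, each keyed by its
// module name (`hacker_news`, `slashdot`, `lobsters`, `reddit`).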
scrapers! {
    hacker_news::HackerNews,
    slashdot::Slashdot,
    lobsters::Lobsters,
    reddit::Reddit,
}

#[cfg(any(test, feature = "scrape_test"))]
pub mod test {
    use super::*;

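    // Embeds the named fixture files from `testdata/` into the test binary at
    // compile time via `include_str!`.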
    macro_rules! stringify_all {
        ( $($s:literal),* ) => {
            vec![ $( include_str!( concat!("../../testdata/", $s ) ) ),* ]
        };
    }

    fn slashdot_files() -> Vec<&'static str> {
        stringify_all!["slashdot1.html", "slashdot2.html", "slashdot3.html"]
    }

    fn hacker_news_files() -> Vec<&'static str> {
        stringify_all!["hn1.html", "hn2.html", "hn3.html", "hn4.html"]
    }

    fn lobsters_files() -> Vec<&'static str> {
        stringify_all!["lobsters1.rss", "lobsters2.rss"]
    }

    fn reddit_files() -> Vec<&'static str> {
        stringify_all![
            "reddit-prog-tag1.json",
            "reddit-prog-tag2.json",
            "reddit-prog1.json",
            "reddit-science1.json",
            "reddit-science2.json"
        ]
    }

    pub fn files_by_source(source: ScrapeSource) -> Vec<&'static str> {
        match source {
            ScrapeSource::HackerNews => hacker_news_files(),
            ScrapeSource::Slashdot => slashdot_files(),
            ScrapeSource::Reddit => reddit_files(),
            ScrapeSource::Lobsters => lobsters_files(),
            ScrapeSource::Other => vec![],
        }
    }

    /// Loads the various sample stories we've collected, sorted by date.
    pub fn load_sample_scrapes(config: &ScrapeConfig) -> Vec<TypedScrape> {
        let mut v = vec![];
        for source in [
            ScrapeSource::HackerNews,
            ScrapeSource::Lobsters,
            ScrapeSource::Reddit,
            ScrapeSource::Slashdot,
        ] {
            for file in files_by_source(source) {
                let mut res = scrape(config, source, file)
                    .unwrap_or_else(|_| panic!("Scrape of {:?} failed", source));
                v.append(&mut res.0);
            }
            v.sort_by_key(|scrape| scrape.date);
        }
        v
    }

    #[test]
    fn test_scrape_all() {
        use crate::ScrapeExtractor;

        let config = ScrapeConfig::default();
        let extractor = ScrapeExtractor::new(&config);
        for scrape in load_sample_scrapes(&config) {
            let scrape = extractor.extract(&scrape);
            // Sanity-check the scrapes: titles and URLs should have HTML
            // entities decoded, and dates should fall within the sample range.
            assert!(
                !scrape.title.contains("&amp")
                    && !scrape.title.contains("&quot")
                    && !scrape.title.contains("&squot")
            );
            assert!(!scrape.url.raw().contains("&amp"));
            assert!(scrape.date.year() == 2023 || scrape.date.year() == 2022);
        }
    }
}
435}