use serde::{Deserialize, Serialize, ser::SerializeMap};
use std::{borrow::Cow, fmt::Debug};

pub use self::def::ScrapeCore;
pub(crate) use self::def::*;
use crate::types::*;

mod def;
pub mod feed;
pub mod hacker_news;
pub mod legacy;
pub mod lobsters;
pub mod reddit;
pub mod slashdot;
mod utils;

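/// Generates the scraper plumbing for each backend listed in the invocation at
/// the bottom of this file: a `ScrapeConfig` with one field per backend, the
/// `ScrapeSource` enum, the `TypedScrape` wrapper enum, and the
/// `TypedScrapeMap` container keyed by source.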
macro_rules! scrapers {
    ($($package:ident :: $name:ident,)*) => {
        pub mod export {
            $( pub use super::$package; )*
        }

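        /// Scrape the given input using the scraper registered for `source`,
        /// returning the parsed scrapes along with any non-fatal warnings.
        ///
        /// A minimal usage sketch (`html` stands in for a page body fetched
        /// elsewhere):
        ///
        /// ```ignore
        /// let config = ScrapeConfig::default();
        /// let (scrapes, warnings) = scrape(&config, ScrapeSource::HackerNews, &html)?;
        /// ```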
        pub fn scrape(
            config: &ScrapeConfig,
            source: ScrapeSource,
            input: &str,
        ) -> Result<(Vec<TypedScrape>, Vec<String>), ScrapeError> {
            match source {
                $(
                    ScrapeSource::$name => {
                        let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                        let (res, warnings) = scraper.scrape(&config.$package, input)?;
                        Ok((res.into_iter().map(|x| x.into()).collect(), warnings))
                    },
                )*
                ScrapeSource::Other => unreachable!(),
            }
        }

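        /// Configuration for all scrape backends, with one field per backend.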
        #[derive(Clone, Default, Serialize, Deserialize)]
        pub struct ScrapeConfig {
            $(
                #[doc="Configuration for the "]
                #[doc=stringify!($name)]
                #[doc=" backend."]
                pub $package: <$package::$name as ScrapeSourceDef>::Config
            ),*
        }

        impl ScrapeConfig {
            pub fn get(&self, source: ScrapeSource) -> Option<&dyn ScrapeConfigSource> {
                match source {
                    $( ScrapeSource::$name => Some(&self.$package), )*
                    ScrapeSource::Other => None,
                }
            }
        }

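        /// Identifies the backend a scrape came from.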
        #[derive(Clone, Copy, Debug, Eq, PartialEq, Hash, PartialOrd, Ord)]
        pub enum ScrapeSource {
            $($name,)*
            Other,
        }

        impl ScrapeSource {
            pub fn into_str(&self) -> &'static str {
                match self {
                    $(Self::$name => stringify!($package),)*
                    Self::Other => "other",
                }
            }

            pub fn try_from_str(s: &str) -> Option<Self> {
                match s {
                    $(stringify!($package) => Some(Self::$name),)*
                    "other" => Some(Self::Other),
                    _ => None,
                }
            }

            pub const fn all() -> &'static [ScrapeSource] {
                &[$(Self::$name),*]
            }

            pub fn comments_url(&self, id: &str, subsource: Option<&str>) -> String {
                match self {
                    $(Self::$name => $package::$name::comments_url(id, subsource),)*
                    _ => unimplemented!()
                }
            }

            pub fn id_from_comments_url(&self, url: &str) -> Option<ScrapeId> {
                match self {
                    $(Self::$name => {
                        let (id, subsource) = $package::$name::id_from_comments_url(url)?;
                        Some(ScrapeId::new(*self, subsource.map(|s| s.to_owned()), id.to_owned()))
                    },)*
                    _ => unimplemented!()
                }
            }

            pub fn is_comments_host(&self, host: &str) -> bool {
                match self {
                    $(Self::$name => $package::$name::is_comments_host(host),)*
                    _ => unimplemented!()
                }
            }

            pub fn id<'a, ID: Clone + Into<Cow<'a, str>>>(&self, id: ID) -> ScrapeId {
                ScrapeId::new(*self, None, id.into().into())
            }

            pub fn subsource_id<'a, ID: Clone + Into<Cow<'a, str>>>(&self, subsource: ID, id: ID) -> ScrapeId {
                ScrapeId::new(*self, Some(subsource.into().into()), id.into().into())
            }
        }

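        /// A scrape from any of the supported backends, tagged with its source.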
        #[derive(Clone, Debug, Deserialize, Serialize)]
        pub enum TypedScrape {
            $( $name (GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>), )*
        }

        impl TypedScrape {
            pub fn merge(&mut self, b: Self) {
                match (self, b) {
                    $( (Self::$name(a), Self::$name(b)) => a.merge_generic(b), )*
                    (_a, _b) => {
                        // Scrapes from different sources cannot be merged; `b` is dropped.
                    }
                }
            }

            pub(crate) fn extract(&self, config: &ScrapeConfig) -> ScrapeCore {
                match self {
                    $(
                        Self::$name(a) => {
                            let scraper = <$package::$name as ScrapeSourceDef>::Scraper::default();
                            scraper.extract_core(&config.$package, a)
                        }
                    )*
                }
            }

            $(
                pub fn $package(&self) -> Option<&GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> {
                    match self {
                        Self::$name(a) => Some(a),
                        _ => None,
                    }
                }
            )*
        }

        impl std::ops::Deref for TypedScrape {
            type Target = ScrapeShared;
            fn deref(&self) -> &Self::Target {
                match self {
                    $( Self::$name(a) => &a.shared, )*
                }
            }
        }

        impl std::ops::DerefMut for TypedScrape {
            fn deref_mut(&mut self) -> &mut Self::Target {
                match self {
                    $( Self::$name(a) => &mut a.shared, )*
                }
            }
        }

        $(
            impl From<GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>> for TypedScrape {
                fn from(x: GenericScrape<<$package::$name as ScrapeSourceDef>::Scrape>) -> Self {
                    TypedScrape::$name(x)
                }
            }
        )*

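        /// A container holding one value of type `V` per scrape source,
        /// including a catch-all `other` slot.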
        #[derive(Debug, Eq, PartialEq)]
        pub struct TypedScrapeMap<V> {
            $( pub $package: V, )*
            pub other: V,
        }

        impl<V: Default> TypedScrapeMap<V> {
            pub fn new() -> Self {
                Self {
                    $( $package: Default::default(), )*
                    other: Default::default(),
                }
            }
        }

        impl<V: Copy> TypedScrapeMap<V> {
            pub fn new_with_all(v: V) -> Self {
                Self {
                    $( $package: v, )*
                    other: v,
                }
            }
        }

        impl<V: Default> Default for TypedScrapeMap<V> {
            fn default() -> Self {
                Self::new()
            }
        }

        impl<V: Clone> Clone for TypedScrapeMap<V> {
            fn clone(&self) -> Self {
                Self {
                    $( $package: self.$package.clone(), )*
                    other: self.other.clone(),
                }
            }
        }

        impl<V> TypedScrapeMap<V> {
            pub fn get(&self, source: ScrapeSource) -> &V {
                match source {
                    $( ScrapeSource::$name => &self.$package, )*
                    ScrapeSource::Other => &self.other,
                }
            }

            pub fn set(&mut self, source: ScrapeSource, mut value: V) -> V {
                match source {
                    $( ScrapeSource::$name => std::mem::swap(&mut value, &mut self.$package), )*
                    ScrapeSource::Other => std::mem::swap(&mut value, &mut self.other),
                }
                value
            }

            pub fn remove(&mut self, source: ScrapeSource) -> V where V: Default {
                self.set(source, V::default())
            }

            pub fn values(&self) -> impl Iterator<Item = &'_ V> {
                [$( &self.$package, )* &self.other].into_iter()
            }

            pub fn iter(&self) -> impl Iterator<Item = (ScrapeSource, &'_ V)> {
                [$( (ScrapeSource::$name, &self.$package), )* (ScrapeSource::Other, &self.other)].into_iter()
            }

            pub fn into_with_map<T>(self, f: impl Fn(ScrapeSource, V) -> T) -> TypedScrapeMap<T> {
                TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package), )*
                    other: f(ScrapeSource::Other, self.other),
                }
            }

            pub fn into_with_map_fallible<T, E>(self, f: impl Fn(ScrapeSource, V) -> Result<T, E>) -> Result<TypedScrapeMap<T>, E> {
                Ok(TypedScrapeMap {
                    $( $package: f(ScrapeSource::$name, self.$package)?, )*
                    other: f(ScrapeSource::Other, self.other)?,
                })
            }
        }

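        /// Helper for counting macro repetitions at compile time: each
        /// `$package` contributes 1 to the array length used by `IntoIterator`
        /// below.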
        const fn one(_: &'static str) -> usize {
            1
        }

        impl<V> IntoIterator for TypedScrapeMap<V> {
            type Item = V;
            type IntoIter = <[V; 1 $( + one(stringify!($package)) )* ] as IntoIterator>::IntoIter;

            fn into_iter(self) -> Self::IntoIter {
                [$(self.$package,)* self.other].into_iter()
            }
        }

        impl<V: Serialize> Serialize for TypedScrapeMap<V> {
            fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
            where
                S: serde::Serializer {
                let mut map = serializer.serialize_map(None)?;
                $(
                    map.serialize_entry(stringify!($package), &self.$package)?;
                )*
                map.serialize_entry("other", &self.other)?;
                map.end()
            }
        }

        impl<'de, V: Default + Deserialize<'de>> Deserialize<'de> for TypedScrapeMap<V> {
            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
            where
                D: serde::Deserializer<'de> {

                #[derive(Deserialize)]
                struct Temp<V> {
                    $( #[serde(default)] $package: V, )*
                    #[serde(default)] other: V,
                }

                let temp = Temp::deserialize(deserializer)?;
                Ok(TypedScrapeMap::<V> {
                    $( $package: temp.$package, )*
                    other: temp.other,
                })
            }
        }

    };
}

impl From<TypedScrape> for (ScrapeId, TypedScrape) {
    fn from(val: TypedScrape) -> Self {
        (val.id.clone(), val)
    }
}

impl Serialize for ScrapeSource {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        self.into_str().serialize(serializer)
    }
}

impl<'de> Deserialize<'de> for ScrapeSource {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        let s = String::deserialize(deserializer)?;
        if let Some(source) = ScrapeSource::try_from_str(&s) {
            Ok(source)
        } else {
            Err(serde::de::Error::custom("Invalid source"))
        }
    }
}

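// Instantiate the scraper plumbing for every supported backend.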
scrapers! {
    hacker_news::HackerNews,
    slashdot::Slashdot,
    lobsters::Lobsters,
    reddit::Reddit,
    feed::Feed,
}

#[cfg(any(test, feature = "scrape_test"))]
pub mod test {
    use super::*;

    // Includes the named files from the testdata directory at compile time.
    macro_rules! stringify_all {
        ( $($s:literal),* ) => {
            vec![ $( include_str!( concat!("../../testdata/", $s ) ) ),* ]
        };
    }

    fn slashdot_files() -> Vec<&'static str> {
        stringify_all!["slashdot1.html", "slashdot2.html", "slashdot3.html"]
    }

    fn hacker_news_files() -> Vec<&'static str> {
        stringify_all![
            "hn1.html", "hn2.html", "hn3.html", "hn4.html", "hn5.html", "hn6.html"
        ]
    }

    fn lobsters_files() -> Vec<&'static str> {
        stringify_all!["lobsters1.rss", "lobsters2.rss"]
    }

    fn reddit_files() -> Vec<&'static str> {
        stringify_all![
            "reddit-prog-tag1.json",
            "reddit-prog-tag2.json",
            "reddit-prog1.json",
            "reddit-science1.json",
            "reddit-science2.json"
        ]
    }

    /// Returns the sample test files for the given source.
    pub fn files_by_source(source: ScrapeSource) -> Vec<&'static str> {
        match source {
            ScrapeSource::HackerNews => hacker_news_files(),
            ScrapeSource::Slashdot => slashdot_files(),
            ScrapeSource::Reddit => reddit_files(),
            ScrapeSource::Lobsters => lobsters_files(),
            ScrapeSource::Feed => vec![],
            ScrapeSource::Other => vec![],
        }
    }

    /// Loads and scrapes every sample file, returning the scrapes sorted by date.
    pub fn load_sample_scrapes(config: &ScrapeConfig) -> Vec<TypedScrape> {
        let mut v = vec![];
        for source in [
            ScrapeSource::HackerNews,
            ScrapeSource::Lobsters,
            ScrapeSource::Reddit,
            ScrapeSource::Slashdot,
        ] {
            for file in files_by_source(source) {
                let mut res = scrape(config, source, file)
                    .unwrap_or_else(|_| panic!("Scrape of {source:?} failed"));
                if res.0.is_empty() {
                    panic!("Failed to scrape anything! {file} {:?}", res.1);
                }
                v.append(&mut res.0);
            }
            v.sort_by_key(|scrape| scrape.date);
        }
        v
    }

    #[test]
    fn test_scrape_all() {
        use crate::ScrapeExtractor;

        let config = ScrapeConfig::default();
        let extractor = ScrapeExtractor::new(&config);
        for scrape in load_sample_scrapes(&config) {
            let scrape = extractor.extract(&scrape);
            // Titles and URLs should have had HTML entities decoded by the scrapers.
            assert!(
                !scrape.title.contains("&amp;")
                    && !scrape.title.contains("&quot;")
                    && !scrape.title.contains("&squot")
            );
            assert!(!scrape.url.raw().contains("&amp;"));
            assert!(scrape.date.year() >= 2022 && scrape.date.year() <= 2024);
        }
    }
}
443}