progscrape_scrapers/backends/def.rs

use crate::ScrapeId;

use super::*;

/// Our scrape sources, and the associated data types for each.
pub trait ScrapeSourceDef {
    type Config: ScrapeConfigSource;
    type Scrape: ScrapeStory;
    type Scraper: Scraper<Config = Self::Config, Output = Self::Scrape>;

    /// Builds the comments-page URL for a story ID and optional subsource.
    fn comments_url(id: &str, subsource: Option<&str>) -> String;
    /// Recovers the story ID and optional subsource from a comments-page URL, if it belongs to this source.
    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)>;
    /// Returns true if `host` serves this source's comment pages.
    fn is_comments_host(host: &str) -> bool;
}
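// A minimal sketch of an implementation, assuming a hypothetical `Example`
// backend; the `Example*` types and the URL format below are illustrative,
// not part of this crate:
//
//     struct Example;
//
//     impl ScrapeSourceDef for Example {
//         type Config = ExampleConfig;
//         type Scrape = ExampleStory;
//         type Scraper = ExampleScraper;
//
//         fn comments_url(id: &str, _subsource: Option<&str>) -> String {
//             format!("https://example.com/item?id={id}")
//         }
//
//         fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
//             url.strip_prefix("https://example.com/item?id=")
//                 .map(|id| (id, None))
//         }
//
//         fn is_comments_host(host: &str) -> bool {
//             host == "example.com" || host.ends_with(".example.com")
//         }
//     }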

pub trait ScrapeStory {
    /// The scrape source that produces this story type.
    const TYPE: ScrapeSource;

    /// Merges the source-specific fields of another scrape of the same story into this one.
    fn merge(&mut self, other: Self);
}

pub trait Scraper: Default {
    type Config: ScrapeConfigSource;
    type Output: ScrapeStory;

    /// Given input in the correct format, scrapes raw stories.
    fn scrape(
        &self,
        args: &Self::Config,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError>;

    /// Extract the core scrape elements from the raw scrape.
    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a>;
}
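// A sketch of how the two calls fit together for a hypothetical backend;
// `ExampleScraper`, `ExampleConfig`, and `raw_json` are assumptions, and
// reading the second tuple element as diagnostics is an interpretation:
//
//     let scraper = ExampleScraper::default();
//     let config = ExampleConfig::default();
//     let (scrapes, _diagnostics) = scraper.scrape(&config, raw_json)?;
//     for scrape in &scrapes {
//         let core = scraper.extract_core(&config, scrape);
//         println!("{} (rank {:?})", core.title, core.rank);
//     }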

pub trait ScrapeConfigSource {
    /// Returns the list of configured subsources for this source.
    fn subsources(&self) -> Vec<String>;
    /// Returns the URLs to fetch for the given set of subsources.
    fn provide_urls(&self, subsources: Vec<String>) -> Vec<String>;
}
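// A sketch of a config implementation, assuming a hypothetical source whose
// subsources map one-to-one onto feed URLs (names and URL format are
// illustrative):
//
//     #[derive(Default, Serialize, Deserialize)]
//     struct ExampleConfig {
//         subsources: Vec<String>,
//     }
//
//     impl ScrapeConfigSource for ExampleConfig {
//         fn subsources(&self) -> Vec<String> {
//             self.subsources.clone()
//         }
//
//         fn provide_urls(&self, subsources: Vec<String>) -> Vec<String> {
//             subsources
//                 .into_iter()
//                 .map(|s| format!("https://example.com/{s}/feed.json"))
//                 .collect()
//         }
//     }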

#[derive(Clone, Debug)]
pub struct ScrapeCore<'a> {
    /// The scrape source ID.
    pub source: &'a ScrapeId,

    /// Story title from this scrape source, potentially edited based on source (stripping suffixes, etc).
    pub title: Cow<'a, str>,

    /// Story URL.
    pub url: &'a StoryUrl,

    /// Story date/time.
    pub date: StoryDate,

    /// Story tags from scrape source.
    pub tags: Vec<Cow<'a, str>>,

    /// If this story has a rank, lower is better.
    pub rank: Option<usize>,
}

/// The fields shared by every scrape, regardless of source.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ScrapeShared {
    pub id: ScrapeId,
    pub url: StoryUrl,
    pub raw_title: String,
    pub date: StoryDate,
}

/// A scrape from a single source: the shared fields plus the source-specific data.
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct GenericScrape<T: ScrapeStory> {
    #[serde(flatten)]
    pub shared: ScrapeShared,
    #[serde(flatten)]
    pub data: T,
}
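// Because both fields are `#[serde(flatten)]`ed, a `GenericScrape<T>` serializes
// as a single flat object. For a hypothetical `ExampleStory { points: u32 }`,
// the JSON would look roughly like the following (field order and the exact
// serialization of `ScrapeId`/`StoryUrl`/`StoryDate` depend on their own impls):
//
//     {
//         "id": "...",
//         "url": "...",
//         "raw_title": "Example story",
//         "date": "...",
//         "points": 42
//     }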

// Deref to the shared fields so callers can write `scrape.url`, `scrape.date`,
// etc. directly on a `GenericScrape<T>`.
impl<T: ScrapeStory> std::ops::Deref for GenericScrape<T> {
    type Target = ScrapeShared;
    fn deref(&self) -> &Self::Target {
        &self.shared
    }
}

impl<T: ScrapeStory> std::ops::DerefMut for GenericScrape<T> {
    fn deref_mut(&mut self) -> &mut Self::Target {
        &mut self.shared
    }
}

impl<T: ScrapeStory> GenericScrape<T> {
    /// Merges the shared fields of another scrape of the same story. Currently a no-op.
    pub fn merge_generic(&mut self, _other: Self) {}
}
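// Usage sketch: thanks to the `Deref`/`DerefMut` impls above, the shared fields
// are reachable directly on the wrapper (assuming `scrape` is a
// `GenericScrape<ExampleStory>`):
//
//     let title: &str = &scrape.raw_title;
//     let id = scrape.id.clone();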

/// Generates a source-specific scrape struct: a plain data struct with the given
/// fields, plus constructors that bundle it with the shared `ScrapeShared` fields
/// into a `GenericScrape`.
macro_rules! scrape_story {
    ( $name:ident { $( $id:ident : $type:ty ),* $(,)? } ) => {
        #[derive(Serialize, Deserialize, Clone, Debug, Default)]
        pub struct $name {
            $( pub $id : $type ),*
        }

        impl $name {
            /// Constructs a scrape with no subsource.
            #[allow(clippy::too_many_arguments)]
            pub fn new<'a, S: Clone + Into<Cow<'a, str>>>(id: S, date: StoryDate, raw_title: S, url: StoryUrl, $( $id: $type ),*) -> GenericScrape<$name> {
                GenericScrape {
                    shared: ScrapeShared {
                        id: ScrapeId::new(<$name as ScrapeStory>::TYPE, None, id.into().into()), date, raw_title: raw_title.into().into(), url
                    },
                    data: $name {
                        $($id),*
                    }
                }
            }

            /// Constructs a scrape with a subsource.
            #[allow(clippy::too_many_arguments)]
            pub fn new_subsource<'a, S: Clone + Into<Cow<'a, str>>>(id: S, subsource: S, date: StoryDate, raw_title: S, url: StoryUrl, $( $id: $type ),*) -> GenericScrape<$name> {
                GenericScrape {
                    shared: ScrapeShared {
                        id: ScrapeId::new(<$name as ScrapeStory>::TYPE, Some(subsource.into().into()), id.into().into()), date, raw_title: raw_title.into().into(), url
                    },
                    data: $name {
                        $($id),*
                    }
                }
            }

            /// Constructs a scrape with no subsource, defaulting all source-specific fields.
            #[allow(clippy::too_many_arguments)]
            pub fn new_with_defaults<'a, S: Clone + Into<Cow<'a, str>>>(id: S, date: StoryDate, raw_title: S, url: StoryUrl) -> GenericScrape<$name> {
                GenericScrape {
                    shared: ScrapeShared {
                        id: ScrapeId::new(<$name as ScrapeStory>::TYPE, None, id.into().into()), date, raw_title: raw_title.into().into(), url
                    },
                    data: $name {
                        $($id : Default::default() ),*
                    }
                }
            }

            /// Constructs a scrape with a subsource, defaulting all source-specific fields.
            #[allow(clippy::too_many_arguments)]
            pub fn new_subsource_with_defaults<'a, S: Clone + Into<Cow<'a, str>>>(id: S, subsource: S, date: StoryDate, raw_title: S, url: StoryUrl) -> GenericScrape<$name> {
                GenericScrape {
                    shared: ScrapeShared {
                        id: ScrapeId::new(<$name as ScrapeStory>::TYPE, Some(subsource.into().into()), id.into().into()), date, raw_title: raw_title.into().into(), url
                    },
                    data: $name {
                        $($id : Default::default() ),*
                    }
                }
            }
        }
    };
}

pub(crate) use scrape_story;
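
// Usage sketch: a backend might declare its scrape type with the macro and then
// implement `ScrapeStory` by hand. `ExampleStory`, the `ScrapeSource::Example`
// variant, and the field choices are illustrative assumptions:
//
//     scrape_story!(ExampleStory {
//         points: u32,
//         comments: u32,
//     });
//
//     impl ScrapeStory for ExampleStory {
//         const TYPE: ScrapeSource = ScrapeSource::Example;
//
//         fn merge(&mut self, other: Self) {
//             self.points = self.points.max(other.points);
//             self.comments = self.comments.max(other.comments);
//         }
//     }
//
//     // ExampleStory::new(id, date, raw_title, url, points, comments)
//     // then yields a GenericScrape<ExampleStory>.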