progscrape_scrapers/backends/lobsters.rs
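//! Scraper backend for Lobsters (<https://lobste.rs>), built on its public RSS feed.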

use std::collections::HashSet;

use super::*;

use roxmltree::Document;
use serde::{Deserialize, Serialize};

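/// Scrape source definition for Lobsters: the URL mapping between story IDs
/// and their comments pages, plus the host check used to recognize them.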
pub struct Lobsters {}

impl ScrapeSourceDef for Lobsters {
    type Config = LobstersConfig;
    type Scrape = LobstersStory;
    type Scraper = LobstersScraper;

    fn comments_url(id: &str, _subsource: Option<&str>) -> String {
        format!("https://lobste.rs/s/{id}/")
    }

    // Inverse of `comments_url`: recover the story ID from a comments URL.
    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
        let url = url.trim_end_matches('/');
        Some((url.strip_prefix("https://lobste.rs/s/")?, None))
    }

    fn is_comments_host(host: &str) -> bool {
        host.ends_with("lobste.rs")
    }
}

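/// Configuration for the Lobsters scraper: the feed URL to fetch and a set of
/// tags to suppress when extracting stories.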
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct LobstersConfig {
    feed: String,
    tag_denylist: HashSet<String>,
}

impl ScrapeConfigSource for LobstersConfig {
    // Lobsters has no subsources; only the single configured feed is scraped.
    fn subsources(&self) -> Vec<String> {
        vec![]
    }

    fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
        vec![self.feed.clone()]
    }
}

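// `scrape_story!` defines `LobstersStory`: the shared story fields (id, raw
// title, URL, date) plus the Lobsters-specific fields listed here.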
scrape_story! {
    LobstersStory {
        num_comments: u32,
        position: u32,
        score: u32,
        tags: Vec<String>,
    }
}

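// When the same story is scraped more than once, keep the highest score and
// comment count observed.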
impl ScrapeStory for LobstersStory {
    const TYPE: ScrapeSource = ScrapeSource::Lobsters;

    fn merge(&mut self, other: LobstersStory) {
        self.score = std::cmp::max(self.score, other.score);
        self.num_comments = std::cmp::max(self.num_comments, other.num_comments);
    }
}

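// Stateless scraper for the Lobsters RSS feed.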
#[derive(Default)]
pub struct LobstersScraper {}

impl Scraper for LobstersScraper {
    type Config = <Lobsters as ScrapeSourceDef>::Config;
    type Output = <Lobsters as ScrapeSourceDef>::Scrape;

    fn scrape(
        &self,
        _args: &Self::Config,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
        let doc = Document::parse(input)?;
        let rss = doc.root_element();
        let mut warnings = vec![];
        let mut stories = vec![];
        // The feed is standard RSS: <rss><channel><item>...</item></channel></rss>.
        for channel in rss.children() {
            if channel.tag_name().name() == "channel" {
                for (position, item) in channel
                    .children()
                    .filter(|item| item.tag_name().name() == "item")
                    .enumerate()
                {
                    // Collect the fields we need from the <item>'s children;
                    // all four of these are required to build a story.
                    let mut raw_title = None;
                    let mut id = None;
                    let mut url = None;
                    let mut date = None;
                    let mut tags = vec![];
                    for subitem in item.children() {
                        if !subitem.is_element() {
                            continue;
                        }
                        match subitem.tag_name().name() {
                            "title" => raw_title = subitem.text().map(|s| s.to_owned()),
                            // The <guid> is the story's comments URL; strip the
                            // prefix to recover the bare story ID.
                            "guid" => {
                                id = subitem.text().map(|s| {
                                    s.trim_start_matches("https://lobste.rs/s/").to_owned()
                                })
                            }
                            "link" => url = subitem.text().and_then(StoryUrl::parse),
                            "author" => {}
                            "pubDate" => {
                                date = subitem.text().and_then(StoryDate::parse_from_rfc2822)
                            }
                            "comments" => {}
                            "category" => {
                                if let Some(s) = subitem.text() {
                                    tags.push(s.to_owned());
                                }
                            }
                            "description" => {}
                            x => warnings.push(format!("Unknown sub-node '{x}'")),
                        }
                    }
                    if let (Some(raw_title), Some(id), Some(url), Some(date)) =
                        (raw_title, id, url, date)
                    {
                        // Feed order is the only ranking signal; store a 1-based
                        // position (zero reads as unranked in `extract_core`).
                        // The feed carries no score or comment count, so both
                        // default to zero and are filled in by later merges.
                        let position = position as u32 + 1;
                        let num_comments = 0;
                        let score = 0;
                        stories.push(LobstersStory::new(
                            id,
                            date,
                            raw_title,
                            url,
                            num_comments,
                            position,
                            score,
                            tags,
                        ));
                    } else {
                        warnings.push("Story did not contain all required fields".to_string());
                    }
                }
            }
        }
        Ok((stories, warnings))
    }

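    // Project the raw scrape into the engine-wide `ScrapeCore` form.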
    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a> {
        // Pass the scraped tags through, skipping any on the denylist.
        let mut tags = Vec::new();
        for tag in &input.data.tags {
            if args.tag_denylist.contains(tag) {
                continue;
            }
            tags.push(Cow::Borrowed(tag.as_str()));
        }

        ScrapeCore {
            source: &input.shared.id,
            title: Cow::Borrowed(&input.shared.raw_title),
            url: &input.shared.url,
            date: input.shared.date,
            tags,
            // `position` is 1-based, so convert back to a 0-based rank;
            // `checked_sub` yields `None` for an unranked (zero) position.
            rank: (input.data.position as usize).checked_sub(1),
        }
    }
}