progscrape_scrapers/backends/lobsters.rs

use std::borrow::Cow; // used by `extract_core` below
use std::collections::HashSet;

use super::*;

use roxmltree::Document;
use serde::{Deserialize, Serialize};

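/// Scrape source definition for https://lobste.rs, tying together the
/// config, story, and scraper types declared below.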
pub struct Lobsters {}

impl ScrapeSourceDef for Lobsters {
    type Config = LobstersConfig;
    type Scrape = LobstersStory;
    type Scraper = LobstersScraper;

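    /// Builds the canonical comments URL for a story ID.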
    fn comments_url(id: &str, _subsource: Option<&str>) -> String {
        format!("https://lobste.rs/s/{id}/")
    }

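    /// Inverse of `comments_url`: recovers the story ID from a comments URL.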
    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
        let url = url.trim_end_matches('/');
        Some((url.strip_prefix("https://lobste.rs/s/")?, None))
    }

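    /// Any host ending in `lobste.rs` is treated as the comments host.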
    fn is_comments_host(host: &str) -> bool {
        host.ends_with("lobste.rs")
    }
}

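/// Scraper configuration: the RSS feed URL to fetch, plus a set of tags
/// whose stories are filtered out at extraction time.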
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct LobstersConfig {
    feed: String,
    tag_denylist: HashSet<String>,
}

impl ScrapeConfigSource for LobstersConfig {
    fn subsources(&self) -> Vec<String> {
        vec![]
    }

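    /// Lobsters is scraped from the single configured feed URL.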
    fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
        vec![self.feed.clone()]
    }
}

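// `scrape_story!` generates the `LobstersStory` type: the shared story fields
// (id, date, raw_title, url) plus these Lobsters-specific ones.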
scrape_story! {
    LobstersStory {
        num_comments: u32,
        position: u32,
        score: u32,
        tags: Vec<String>,
    }
}

impl ScrapeStory for LobstersStory {
    const TYPE: ScrapeSource = ScrapeSource::Lobsters;

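    /// When the same story is scraped more than once, keep the larger score
    /// and comment count.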
    fn merge(&mut self, other: LobstersStory) {
        self.score = std::cmp::max(self.score, other.score);
        self.num_comments = std::cmp::max(self.num_comments, other.num_comments);
    }
}

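/// Parses the Lobsters RSS feed into `LobstersStory` scrapes.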
#[derive(Default)]
pub struct LobstersScraper {}

impl Scraper for LobstersScraper {
    type Config = <Lobsters as ScrapeSourceDef>::Config;
    type Output = <Lobsters as ScrapeSourceDef>::Scrape;

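    /// Walks the RSS document, pulling the fields of interest from each
    /// `<item>` under `<channel>`; unrecognized elements are reported as
    /// warnings rather than errors.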
    fn scrape(
        &self,
        _args: &Self::Config,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
        let doc = Document::parse(input)?;
        let rss = doc.root_element();
        let mut warnings = vec![];
        let mut stories = vec![];
        for channel in rss.children() {
            if channel.tag_name().name() == "channel" {
                for (position, item) in channel
                    .children()
                    .filter(|item| item.tag_name().name() == "item")
                    .enumerate()
                {
                    let mut raw_title = None;
                    let mut id = None;
                    let mut url = None;
                    let mut date = None;
                    let mut tags = vec![];
                    for subitem in item.children() {
                        if !subitem.is_element() {
                            continue;
                        }
                        match subitem.tag_name().name() {
                            "title" => raw_title = subitem.text().map(|s| s.to_owned()),
                            "guid" => {
                                id = subitem.text().map(|s| {
                                    s.trim_start_matches("https://lobste.rs/s/").to_owned()
                                })
                            }
                            "link" => url = subitem.text().and_then(StoryUrl::parse),
                            "author" => {}
                            "pubDate" => {
                                date = subitem.text().and_then(StoryDate::parse_from_rfc2822)
                            }
                            "comments" => {}
                            "category" => drop(subitem.text().map(|s| tags.push(s.to_owned()))),
                            "description" => {}
                            x => warnings.push(format!("Unknown sub-node '{x}'")),
                        }
                    }
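                    // A usable story needs a title, ID, URL, and date; any
                    // item missing one is recorded as a warning instead.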
                    if let (Some(raw_title), Some(id), Some(url), Some(date)) =
                        (raw_title, id, url, date)
                    {
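                        // Positions are stored 1-based. The feed carries no
                        // score or comment count, so both start at zero.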
                        let position = position as u32 + 1;
                        let num_comments = 0;
                        let score = 0;
                        stories.push(LobstersStory::new(
                            id,
                            date,
                            raw_title,
                            url,
                            num_comments,
                            position,
                            score,
                            tags,
                        ));
                    } else {
                        warnings.push("Story did not contain all required fields".to_string());
                    }
                }
            }
        }
        Ok((stories, warnings))
    }

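    /// Projects a stored scrape into the common `ScrapeCore` form, dropping
    /// denylisted tags.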
    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a> {
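        // Copy tags through, skipping any on the configured denylist.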
        let mut tags = Vec::new();
        for tag in &input.data.tags {
            if args.tag_denylist.contains(tag) {
                continue;
            }
            tags.push(Cow::Borrowed(tag.as_str()));
        }

        ScrapeCore {
            source: &input.shared.id,
            title: Cow::Borrowed(&input.shared.raw_title),
            url: &input.shared.url,
            date: input.shared.date,
            tags,
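            // Convert the 1-based position back to a 0-based rank;
            // `checked_sub` yields `None` when the position was never set.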
            rank: (input.data.position as usize).checked_sub(1),
        }
    }
}
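
// A minimal round-trip sketch: `comments_url` and `id_from_comments_url`
// should invert each other. The story ID `abc123` is a made-up example.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn comments_url_roundtrip() {
        let url = Lobsters::comments_url("abc123", None);
        assert_eq!(url, "https://lobste.rs/s/abc123/");
        assert_eq!(Lobsters::id_from_comments_url(&url), Some(("abc123", None)));
    }
}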