progscrape_scrapers/backends/
slashdot.rs1use std::{
2 borrow::{Borrow, Cow},
3 collections::HashSet,
4 time::SystemTime,
5};
6
7use chrono::{DateTime, TimeZone, Utc};
8use itertools::Itertools;
9use serde::{Deserialize, Serialize};
10use tl::{HTMLTag, Parser, ParserOptions};
11use url::Url;
12
13use crate::types::*;
14
15use super::{
16 GenericScrape, ScrapeConfigSource, ScrapeCore, ScrapeSource, ScrapeSourceDef, ScrapeStory,
17 Scraper, scrape_story, utils::html::*,
18};
19
/// Marker type that wires the Slashdot backend into the generic scrape
/// framework via the `ScrapeSourceDef` impl below.
pub struct Slashdot {}
21
22impl ScrapeSourceDef for Slashdot {
23 type Config = SlashdotConfig;
24 type Scrape = SlashdotStory;
25 type Scraper = SlashdotScraper;
26
27 fn comments_url(id: &str, _subsource: Option<&str>) -> String {
28 format!("https://tech.slashdot.org/story/{id}/")
29 }
30
31 fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
32 let url = url.trim_end_matches('/');
33 Some((url.strip_prefix("https://tech.slashdot.org/story/")?, None))
34 }
35
36 fn is_comments_host(host: &str) -> bool {
37 host.ends_with("slashdot.org")
38 }
39}
40
/// Configuration for the Slashdot scraper, deserialized from the scrape config.
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct SlashdotConfig {
    // URL of the front page to fetch (the only URL this source scrapes).
    homepage: String,
    // Tags permitted to pass through to `ScrapeCore::tags`; all others are
    // dropped in `extract_core`.
    tag_allowlist: HashSet<String>,
}
46
47impl ScrapeConfigSource for SlashdotConfig {
48 fn subsources(&self) -> Vec<String> {
49 vec![]
50 }
51
52 fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
53 vec![self.homepage.clone()]
54 }
55}
56
// Defines `SlashdotStory` via the shared `scrape_story!` macro: the common
// scrape fields (id, date, title, url) plus the Slashdot-specific ones below.
scrape_story! {
    SlashdotStory {
        // Comment count read from the story's `.comment-bubble` element.
        num_comments: u32,
        // Topic names extracted from the story's `.topic` links.
        tags: Vec<String>,
    }
}
63
64impl ScrapeStory for SlashdotStory {
65 const TYPE: ScrapeSource = ScrapeSource::Slashdot;
66
67 fn merge(&mut self, other: Self) {
68 self.num_comments = std::cmp::max(self.num_comments, other.num_comments);
69 }
70}
71
/// Stateless scraper for the Slashdot front page; all parsing logic lives in
/// the inherent impl and the `Scraper` impl below.
#[derive(Default)]
pub struct SlashdotScraper {}
74
75impl SlashdotScraper {
76 fn parse_time(date: &str) -> Result<StoryDate, String> {
77 let tz = chrono_tz::US::Eastern;
82 let res = tz.from_utc_datetime(&DateTime::<Utc>::from(SystemTime::now()).naive_utc());
83
84 let date = format!(
86 "{} {}",
87 date.trim_start_matches("on ").replace(['@', ','], ""),
88 res.format("%z")
89 );
90
91 let day_of_week = ["%A ", ""];
95 let day = ["%d", "%e"];
96 let am_pm = ["%p", "%P"];
97
98 for ((day_of_week, day), am_pm) in day_of_week
100 .iter()
101 .cartesian_product(day)
102 .cartesian_product(am_pm)
103 {
104 let pattern = format!("{day_of_week}%B {day} %Y %I:%M{am_pm} %z");
105 if let Some(date) = StoryDate::from_string(&date, &pattern) {
106 return Ok(date);
107 }
108 }
109
110 Err(format!("Failed to parse date: {date}"))
111 }
112
113 fn parse_topic(href: &str) -> Option<String> {
114 let base = Url::parse("https://slashdot.org").expect("Failed to parse base URL");
115 let url = base.join(href);
116 if let Ok(url) = url {
117 if let Some((_, value)) = url.query_pairs().find(|(k, _)| k == "fhfilter") {
118 return Some(value.into());
119 }
120 }
121 None
122 }
123
124 fn map_story(
125 p: &Parser,
126 article: &HTMLTag,
127 ) -> Result<GenericScrape<<Self as Scraper>::Output>, String> {
128 let title = find_first(p, article, ".story-title").ok_or("Missing .story-title")?;
129 let mut links = html_tag_iterator(p, title.query_selector(p, "a"));
130 let story_link = links.next().ok_or("Missing story link")?;
131 let raw_title = unescape_entities(story_link.inner_text(p).borrow());
132 if raw_title.len() < 5 {
133 return Err(format!("Title was too short: {raw_title}"));
134 }
135 let story_url =
136 get_attribute(p, story_link, "href").ok_or_else(|| "Missing story href".to_string())?;
137 let (_, b) = story_url
138 .split_once("/story/")
139 .ok_or(format!("Invalid link format: {story_url}"))?;
140 let id = b.splitn(5, '/').take(4).collect::<Vec<_>>();
141 if id.len() != 4 {
142 return Err(format!("Invalid link format: {story_url}"));
143 }
144 let id = id.join("/");
145
146 let external_link = links.next().ok_or("Missing external link")?;
147 let href = unescape_entities(
148 &get_attribute(p, external_link, "href").ok_or_else(|| "Missing href".to_string())?,
149 );
150 let url = StoryUrl::parse(&href).ok_or(format!("Invalid href: {href}"))?;
151
152 let num_comments = if let Some(comments) = find_first(p, article, ".comment-bubble") {
154 comments
155 .inner_text(p)
156 .parse()
157 .map_err(|_e| "Failed to parse number of comments")?
158 } else {
159 0
160 };
161
162 let topics = find_first(p, article, ".topic").ok_or_else(|| "Mising topics".to_string())?;
163 let mut tags = vec![];
164 for topic in html_tag_iterator(p, topics.query_selector(p, "a")) {
165 if let Some(topic_href) = get_attribute(p, topic, "href") {
166 if let Some(topic) = Self::parse_topic(&topic_href) {
167 tags.push(topic);
168 }
169 }
170 }
171
172 let date =
173 find_first(p, article, "time").ok_or_else(|| "Could not locate time".to_string())?;
174 let date = Self::parse_time(&date.inner_text(p))?;
175
176 Ok(SlashdotStory::new(
177 id,
178 date,
179 raw_title,
180 url,
181 num_comments,
182 tags,
183 ))
184 }
185}
186
187impl Scraper for SlashdotScraper {
188 type Config = <Slashdot as ScrapeSourceDef>::Config;
189 type Output = <Slashdot as ScrapeSourceDef>::Scrape;
190
191 fn scrape(
192 &self,
193 _args: &Self::Config,
194 input: &str,
195 ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
196 let dom = tl::parse(input, ParserOptions::default())?;
197 let p = dom.parser();
198 let mut errors = vec![];
199 let mut v = vec![];
200
201 for article in html_tag_iterator(p, dom.query_selector("article.article")) {
202 match Self::map_story(p, article) {
203 Ok(s) => v.push(s),
204 Err(e) => errors.push(e),
205 }
206 }
207
208 Ok((v, errors))
209 }
210
211 fn extract_core<'a>(
212 &self,
213 args: &Self::Config,
214 input: &'a GenericScrape<Self::Output>,
215 ) -> ScrapeCore<'a> {
216 let mut tags = vec![];
217 for tag in &input.data.tags {
218 if args.tag_allowlist.contains(tag) {
219 tags.push(Cow::Borrowed(tag.as_str()));
220 }
221 }
222
223 ScrapeCore {
224 source: &input.shared.id,
225 date: input.shared.date,
226 title: Cow::Borrowed(&input.shared.raw_title),
227 url: &input.shared.url,
228 rank: None,
229 tags,
230 }
231 }
232}
233
#[cfg(test)]
pub mod test {
    use super::*;
    use rstest::*;

    // Each case reflects a timestamp shape seen on the Slashdot front page:
    // with and without the weekday, zero-padded and unpadded day numbers, and
    // the 12:00AM / 12:30PM edge cases of 12-hour time.
    #[rstest]
    #[case("on Monday January 09, 2023 @08:25PM")]
    #[case("on Wednesday January 1, 2020 @11:00AM")]
    #[case("on Wednesday January 1, 2020 @12:00AM")]
    #[case("on Wednesday January 1, 2020 @12:30PM")]
    #[case("on January 1, 2020 @12:30PM")]
    fn test_date_parse(#[case] s: &str) {
        SlashdotScraper::parse_time(s).expect("Expected this to parse");
    }

    // Topic extraction must handle absolute, protocol-relative, and
    // archive-wrapped URLs alike; the `fhfilter` query value is the topic.
    #[rstest]
    #[case("https://slashdot.org/index2.pl?fhfilter=business", Some("business"))]
    #[case("//slashdot.org/index2.pl?fhfilter=business", Some("business"))]
    #[case(
        "//web.archive.org/web/20180313000356/https://slashdot.org/index2.pl?fhfilter=business",
        Some("business")
    )]
    fn test_extract_topic(#[case] a: &str, #[case] b: Option<&str>) {
        assert_eq!(b.map(String::from), SlashdotScraper::parse_topic(a));
    }
}