progscrape_scrapers/backends/slashdot.rs

use std::{
    borrow::{Borrow, Cow},
    collections::HashSet,
    time::SystemTime,
};

use chrono::{DateTime, TimeZone, Utc};
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use tl::{HTMLTag, Parser, ParserOptions};
use url::Url;

use crate::types::*;

use super::{
    GenericScrape, ScrapeConfigSource, ScrapeCore, ScrapeSource, ScrapeSourceDef, ScrapeStory,
    Scraper, scrape_story, utils::html::*,
};

pub struct Slashdot {}

impl ScrapeSourceDef for Slashdot {
    type Config = SlashdotConfig;
    type Scrape = SlashdotStory;
    type Scraper = SlashdotScraper;

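    /// Builds the canonical comments URL for a story ID. The ID is the
    /// four-segment path extracted in `map_story`, e.g. a hypothetical
    /// "23/01/09/2058226".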
    fn comments_url(id: &str, _subsource: Option<&str>) -> String {
        format!("https://tech.slashdot.org/story/{id}/")
    }

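    /// Inverse of `comments_url`: recovers the story ID from a comments URL.
    /// Note that this only strips the `tech.slashdot.org` prefix that
    /// `comments_url` emits, not other slashdot.org subdomains.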
    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
        let url = url.trim_end_matches('/');
        Some((url.strip_prefix("https://tech.slashdot.org/story/")?, None))
    }

    fn is_comments_host(host: &str) -> bool {
        host.ends_with("slashdot.org")
    }
}

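/// Configuration for the Slashdot scraper: the homepage URL to fetch and
/// the allowlist of topic tags that survive into `extract_core`.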
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct SlashdotConfig {
    homepage: String,
    tag_allowlist: HashSet<String>,
}

impl ScrapeConfigSource for SlashdotConfig {
    fn subsources(&self) -> Vec<String> {
        vec![]
    }

    fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
        vec![self.homepage.clone()]
    }
}

scrape_story! {
    SlashdotStory {
        num_comments: u32,
        tags: Vec<String>,
    }
}

impl ScrapeStory for SlashdotStory {
    const TYPE: ScrapeSource = ScrapeSource::Slashdot;

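    /// When two scrapes of the same story are merged, keep the larger
    /// comment count.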
    fn merge(&mut self, other: Self) {
        self.num_comments = std::cmp::max(self.num_comments, other.num_comments);
    }
}

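/// Scraper for the Slashdot homepage. A minimal usage sketch, assuming
/// `config` and `html` are a loaded `SlashdotConfig` and a fetched homepage
/// (both hypothetical here):
///
/// ```ignore
/// let scraper = SlashdotScraper::default();
/// let (stories, errors) = scraper.scrape(&config, &html)?;
/// ```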
#[derive(Default)]
pub struct SlashdotScraper {}

impl SlashdotScraper {
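    /// Parses Slashdot's human-readable story timestamps, e.g.
    /// "on Monday January 09, 2023 @08:25PM" (see the test cases below).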
    fn parse_time(date: &str) -> Result<StoryDate, String> {
        // Slashdot runs in EST5EDT (according to the latest slashcode release) if you
        // are logged out. If we wanted to be more accurate, we could scrape the times from the
        // RSS feed and correlate with the website, but we're going to make do here instead.

        let tz = chrono_tz::US::Eastern;
        let res = tz.from_utc_datetime(&DateTime::<Utc>::from(SystemTime::now()).naive_utc());

        // Clean up the "on " prefix, @ signs and commas, then append the current
        // US Eastern UTC offset
        let date = format!(
            "{} {}",
            date.trim_start_matches("on ").replace(['@', ','], ""),
            res.format("%z")
        );

        // Expected at this point: 'Monday January 09 2023 08:25PM -0500'

        // https://docs.rs/chrono/latest/chrono/format/strftime/index.html
        let day_of_week = ["%A ", ""];
        let day = ["%d", "%e"];
        let am_pm = ["%p", "%P"];

        // Try every combination: optional day-of-week, zero- vs space-padded
        // day, and uppercase vs lowercase AM/PM
        for ((day_of_week, day), am_pm) in day_of_week
            .iter()
            .cartesian_product(day)
            .cartesian_product(am_pm)
        {
            let pattern = format!("{day_of_week}%B {day} %Y %I:%M{am_pm} %z");
            if let Some(date) = StoryDate::from_string(&date, &pattern) {
                return Ok(date);
            }
        }

        Err(format!("Failed to parse date: {date}"))
    }

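    /// Extracts the topic name from a tag link's `fhfilter` query parameter,
    /// e.g. "https://slashdot.org/index2.pl?fhfilter=business" ->
    /// Some("business") (see the test cases below). Relative hrefs are
    /// resolved against https://slashdot.org first.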
    fn parse_topic(href: &str) -> Option<String> {
        let base = Url::parse("https://slashdot.org").expect("Failed to parse base URL");
        let url = base.join(href);
        if let Ok(url) = url {
            if let Some((_, value)) = url.query_pairs().find(|(k, _)| k == "fhfilter") {
                return Some(value.into());
            }
        }
        None
    }

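    /// Maps a single `article.article` DOM node to a `SlashdotStory`: the
    /// story ID and title come from the first `.story-title` link, the
    /// external URL from the second, the comment count from
    /// `.comment-bubble`, tags from `.topic`, and the date from the `time`
    /// element.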
    fn map_story(
        p: &Parser,
        article: &HTMLTag,
    ) -> Result<GenericScrape<<Self as Scraper>::Output>, String> {
        let title = find_first(p, article, ".story-title").ok_or("Missing .story-title")?;
        let mut links = html_tag_iterator(p, title.query_selector(p, "a"));
        let story_link = links.next().ok_or("Missing story link")?;
        let raw_title = unescape_entities(story_link.inner_text(p).borrow());
        if raw_title.len() < 5 {
            return Err(format!("Title was too short: {raw_title}"));
        }
        let story_url =
            get_attribute(p, story_link, "href").ok_or_else(|| "Missing story href".to_string())?;
        let (_, b) = story_url
            .split_once("/story/")
            .ok_or(format!("Invalid link format: {story_url}"))?;
        // The story ID is the first four path segments after "/story/"
        let id = b.splitn(5, '/').take(4).collect::<Vec<_>>();
        if id.len() != 4 {
            return Err(format!("Invalid link format: {story_url}"));
        }
        let id = id.join("/");

        let external_link = links.next().ok_or("Missing external link")?;
        let href = unescape_entities(
            &get_attribute(p, external_link, "href").ok_or_else(|| "Missing href".to_string())?,
        );
        let url = StoryUrl::parse(&href).ok_or(format!("Invalid href: {href}"))?;

        // The comment bubble doesn't appear if there are no comments on a
        // story, so we need to be flexible
        let num_comments = if let Some(comments) = find_first(p, article, ".comment-bubble") {
            comments
                .inner_text(p)
                .parse()
                .map_err(|_e| "Failed to parse number of comments")?
        } else {
            0
        };

        let topics = find_first(p, article, ".topic").ok_or_else(|| "Missing topics".to_string())?;
        let mut tags = vec![];
        for topic in html_tag_iterator(p, topics.query_selector(p, "a")) {
            if let Some(topic_href) = get_attribute(p, topic, "href") {
                if let Some(topic) = Self::parse_topic(&topic_href) {
                    tags.push(topic);
                }
            }
        }

        let date =
            find_first(p, article, "time").ok_or_else(|| "Could not locate time".to_string())?;
        let date = Self::parse_time(&date.inner_text(p))?;

        Ok(SlashdotStory::new(
            id,
            date,
            raw_title,
            url,
            num_comments,
            tags,
        ))
    }
}

impl Scraper for SlashdotScraper {
    type Config = <Slashdot as ScrapeSourceDef>::Config;
    type Output = <Slashdot as ScrapeSourceDef>::Scrape;

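    /// Parses a scraped homepage, collecting one story per `article.article`
    /// element. Per-story failures are returned as error strings alongside
    /// the successes rather than aborting the whole scrape.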
    fn scrape(
        &self,
        _args: &Self::Config,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
        let dom = tl::parse(input, ParserOptions::default())?;
        let p = dom.parser();
        let mut errors = vec![];
        let mut v = vec![];

        for article in html_tag_iterator(p, dom.query_selector("article.article")) {
            match Self::map_story(p, article) {
                Ok(s) => v.push(s),
                Err(e) => errors.push(e),
            }
        }

        Ok((v, errors))
    }

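    /// Projects a scrape into a `ScrapeCore`, keeping only the tags present
    /// in the configured `tag_allowlist`.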
    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a> {
        let mut tags = vec![];
        for tag in &input.data.tags {
            if args.tag_allowlist.contains(tag) {
                tags.push(Cow::Borrowed(tag.as_str()));
            }
        }

        ScrapeCore {
            source: &input.shared.id,
            date: input.shared.date,
            title: Cow::Borrowed(&input.shared.raw_title),
            url: &input.shared.url,
            rank: None,
            tags,
        }
    }
}

#[cfg(test)]
pub mod test {
    use super::*;
    use rstest::*;

    /// Test that we can parse the various date formats Slashdot renders.
    #[rstest]
    #[case("on Monday January 09, 2023 @08:25PM")]
    #[case("on Wednesday January 1, 2020 @11:00AM")]
    #[case("on Wednesday January 1, 2020 @12:00AM")]
    #[case("on Wednesday January 1, 2020 @12:30PM")]
    #[case("on January 1, 2020 @12:30PM")]
    fn test_date_parse(#[case] s: &str) {
        SlashdotScraper::parse_time(s).expect("Expected this to parse");
    }

    /// Test that we can extract the fhfilter tag in all cases.
    #[rstest]
    #[case("https://slashdot.org/index2.pl?fhfilter=business", Some("business"))]
    #[case("//slashdot.org/index2.pl?fhfilter=business", Some("business"))]
    #[case(
        "//web.archive.org/web/20180313000356/https://slashdot.org/index2.pl?fhfilter=business",
        Some("business")
    )]
    fn test_extract_topic(#[case] a: &str, #[case] b: Option<&str>) {
        assert_eq!(b.map(String::from), SlashdotScraper::parse_topic(a));
    }
}