// progscrape_scrapers/backends/hacker_news.rs

use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{
    borrow::{Borrow, Cow},
    collections::HashMap,
};
use tl::{HTMLTag, Parser, ParserOptions};

use super::{
    GenericScrape, ScrapeConfigSource, ScrapeCore, ScrapeSource, ScrapeSourceDef, ScrapeStory,
    Scraper, scrape_story, utils::html::*,
};
use crate::types::*;

/// Hacker News (https://news.ycombinator.com/) scrape source.
pub struct HackerNews {}

impl ScrapeSourceDef for HackerNews {
    type Config = HackerNewsConfig;
    type Scrape = HackerNewsStory;
    type Scraper = HackerNewsScraper;

    fn comments_url(id: &str, _subsource: Option<&str>) -> String {
        format!("https://news.ycombinator.com/item?id={id}")
    }

    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
        Some((
            url.strip_prefix("https://news.ycombinator.com/item?id=")?,
            None,
        ))
    }

    fn is_comments_host(host: &str) -> bool {
        host.ends_with("news.ycombinator.com")
    }
}

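// Added commentary (not in the original source): `id_from_comments_url` is the
// inverse of `comments_url` for plain item ids, e.g. `comments_url("12345", None)`
// yields "https://news.ycombinator.com/item?id=12345", which maps back to
// `Some(("12345", None))`; URLs on other hosts fail the `strip_prefix` and
// return `None`.
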
/// Configuration for the Hacker News scraper: the site homepage plus the
/// list of pages to fetch from it.
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct HackerNewsConfig {
    homepage: String,
    pages: Vec<String>,
}

impl ScrapeConfigSource for HackerNewsConfig {
    fn subsources(&self) -> Vec<String> {
        vec![]
    }

    fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
        self.pages
            .iter()
            .map(|s| format!("{}{}", self.homepage, s))
            .collect_vec()
    }
}

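// Illustrative example (config values here are hypothetical, not from the
// original source): with `homepage = "https://news.ycombinator.com/"` and
// `pages = ["news", "newest"]`, `provide_urls` yields
// ["https://news.ycombinator.com/news", "https://news.ycombinator.com/newest"].
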
scrape_story! {
    HackerNewsStory {
        points: u32,
        comments: u32,
        position: u32,
    }
}

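// Added commentary (not in the original source): judging from its uses below,
// `scrape_story!` appears to generate a story type whose source-specific fields
// live under `.data`, with shared fields (`id`, `date`, `raw_title`, `url`)
// under `.shared`, plus a `new` constructor taking the shared fields followed
// by the fields listed here (see `HackerNewsStory::new` in `scrape`).
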
impl ScrapeStory for HackerNewsStory {
    const TYPE: ScrapeSource = ScrapeSource::HackerNews;

    fn merge(&mut self, other: HackerNewsStory) {
        // Keep the highest observed counts when the same story is scraped twice.
        self.points = std::cmp::max(self.points, other.points);
        self.comments = std::cmp::max(self.comments, other.comments);
    }
}

#[derive(Default)]
pub struct HackerNewsScraper {}

/// The title row of a story: id, rank, link, and title.
#[derive(Debug)]
struct HackerNewsStoryLine {
    id: String,
    position: u32,
    url: StoryUrl,
    title: String,
}

/// The subtext row of a story: points, comment count, and date, keyed by id.
#[derive(Debug)]
struct HackerNewsInfoLine {
    id: String,
    comments: u32,
    points: u32,
    date: StoryDate,
}

#[derive(Debug)]
enum HackerNewsNode {
    StoryLine(HackerNewsStoryLine),
    InfoLine(HackerNewsInfoLine),
}

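// Added commentary (not in the original source): a Hacker News listing renders
// each story as two adjacent <tr> rows that share an item id. A simplified
// sketch of the markup this parser targets (class names are taken from the
// selectors below; everything else is illustrative):
//
//     <tr id="12345">
//       <td><span class="rank">1.</span></td>
//       <td class="votelinks">...</td>
//       <td><span class="titleline"><a href="https://example.com/">Title</a></span></td>
//     </tr>
//     <tr>
//       <td class="subtext">
//         <span class="score" id="score_12345">100 points</span>
//         <span class="age" title="2024-01-01T00:00:00">...</span>
//         <a href="item?id=12345">5 comments</a>
//       </td>
//     </tr>
//
// `map_node_to_story` classifies each row as a StoryLine or InfoLine, and
// `scrape` later joins the two by id.
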
impl HackerNewsScraper {
    /// Classify a <tr> from the listing as either a title row or a subtext row.
    fn map_node_to_story(&self, p: &Parser, node: &HTMLTag) -> Result<HackerNewsNode, String> {
        if find_first(p, node, "table").is_some() {
            return Err("Story table cannot contain other tables".to_string());
        }

        // Parse the first integer out of a string like "123 points".
        fn extract_number(s: &str) -> Result<u32, String> {
            str::parse(&s.replace(|c: char| !c.is_ascii_digit(), ""))
                .map_err(|_| format!("Failed to parse number: '{s}'"))
        }

        if let Some(titleline) = find_first(p, node, ".titleline") {
            if find_first(p, node, ".votelinks").is_none() {
                return Err("Missing votelinks".to_string());
            }
            let first_link = find_first(p, titleline, "a")
                .ok_or_else(|| "Failed to query first link".to_string())?;
            let title = unescape_entities(first_link.inner_text(p).borrow());
            let mut url = unescape_entities(
                &get_attribute(p, first_link, "href")
                    .ok_or_else(|| "Failed to get href".to_string())?,
            );
            // Self posts (Ask HN, etc.) link to relative "item?id=..." URLs.
            if url.starts_with("item?") {
                url.insert_str(0, "https://news.ycombinator.com/");
            }
            let url = StoryUrl::parse(&url).ok_or(format!("Failed to parse URL {url}"))?;
            let id =
                get_attribute(p, node, "id").ok_or_else(|| "Failed to get id node".to_string())?;
            let rank =
                find_first(p, node, ".rank").ok_or_else(|| "Failed to get rank".to_string())?;
            let position = rank
                .inner_text(p)
                .trim_end_matches('.')
                .parse()
                .or(Err("Failed to parse rank".to_string()))?;
            Ok(HackerNewsNode::StoryLine(HackerNewsStoryLine {
                id,
                position,
                url,
                title,
            }))
        } else if find_first(p, node, ".subtext").is_some() {
            let age_node =
                find_first(p, node, ".age").ok_or_else(|| "Failed to query .age".to_string())?;
            let mut date = get_attribute(p, age_node, "title")
                .ok_or_else(|| "Failed to get age title".to_string())?;
            // The age title lacks a timezone suffix; treat it as UTC.
            if !date.ends_with('Z') {
                date += "Z";
            }
            let date = StoryDate::parse_from_rfc3339_loose(&date)
                .ok_or_else(|| format!("Failed to map date: {date}"))?;
            // "discuss" is shown in place of a comment count when there are none.
            let mut comments = None;
            for node in html_tag_iterator(p, node.query_selector(p, "a")) {
                let text = node.inner_text(p);
                if text.contains("comment") {
                    comments = Some(extract_number(text.borrow())?);
                } else if text.contains("discuss") {
                    comments = Some(0);
                }
            }
            let score_node = find_first(p, node, ".score")
                .ok_or_else(|| "Failed to query .score".to_string())?;
            // The score node carries the item id as "score_<id>".
            let id = get_attribute(p, score_node, "id")
                .ok_or_else(|| "Missing ID on score node".to_string())?
                .trim_start_matches("score_")
                .into();
            let points = extract_number(score_node.inner_text(p).borrow())?;
            let comments = comments.ok_or_else(|| "Missing comment count".to_string())?;
            Ok(HackerNewsNode::InfoLine(HackerNewsInfoLine {
                id,
                comments,
                points,
                date,
            }))
        } else {
            Err("Unknown node type".to_string())
        }
    }

    /// Derive tags from well-known title prefixes and suffixes.
    fn tags_from_title(
        &self,
        _args: &<HackerNews as ScrapeSourceDef>::Config,
        title: &str,
    ) -> Vec<&'static str> {
        let mut tags = vec![];
        // TODO: Strip years [ie: (2005)] from end of title
        if title.starts_with("Show HN") {
            tags.push("show");
        }
        if title.starts_with("Ask HN") {
            tags.push("ask");
        }
        if title.ends_with("[pdf]") {
            tags.push("pdf");
        }
        if title.ends_with("[video]") {
            tags.push("video");
        }
        tags
    }
}

impl Scraper for HackerNewsScraper {
    type Config = <HackerNews as ScrapeSourceDef>::Config;
    type Output = <HackerNews as ScrapeSourceDef>::Scrape;

    fn scrape(
        &self,
        _args: &HackerNewsConfig,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
        let dom = tl::parse(input, ParserOptions::default())?;
        let p = dom.parser();
        let mut errors = vec![];
        let mut story_lines = HashMap::new();
        let mut info_lines = HashMap::new();
        // First pass: classify every <tr> as a title row or a subtext row.
        for node in html_tag_iterator(p, dom.query_selector("tr")) {
            match self.map_node_to_story(p, node) {
                Ok(HackerNewsNode::InfoLine(x)) => {
                    info_lines.insert(x.id.clone(), x);
                }
                Ok(HackerNewsNode::StoryLine(x)) => {
                    story_lines.insert(x.id.clone(), x);
                }
                Err(e) => {
                    errors.push(e);
                }
            }
        }
        // Second pass: join the two row types by item id.
        let mut stories = vec![];
        for (k, v) in story_lines {
            let info = info_lines.remove(&k);
            if let Some(info) = info {
                let HackerNewsStoryLine {
                    url,
                    title: raw_title,
                    position,
                    ..
                } = v;
                let HackerNewsInfoLine {
                    date,
                    points,
                    comments,
                    ..
                } = info;
                let id = k;
                stories.push(HackerNewsStory::new(
                    id, date, raw_title, url, points, comments, position,
                ));
            } else {
                errors.push(format!("Unmatched story/info for id {k}"));
            }
        }
        // HashMap iteration order is arbitrary; restore on-page order.
        stories.sort_by_key(|x| x.data.position);
        Ok((stories, errors))
    }

    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a> {
        let tags = self
            .tags_from_title(args, &input.shared.raw_title)
            .into_iter()
            .map(Cow::Borrowed)
            .collect();
        ScrapeCore {
            source: &input.shared.id,
            title: Cow::Borrowed(&input.shared.raw_title),
            url: &input.shared.url,
            date: input.shared.date,
            // Position is 1-based on the page; rank is 0-based (None for 0).
            rank: (input.data.position as usize).checked_sub(1),
            tags,
        }
    }
}
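
// The tests below are an illustrative sketch added for this write-up, not part
// of the original file: they exercise only the pure helpers defined above, and
// the ids and titles used are made-up examples.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn comments_url_round_trips() {
        let url = HackerNews::comments_url("12345", None);
        assert_eq!(url, "https://news.ycombinator.com/item?id=12345");
        assert_eq!(HackerNews::id_from_comments_url(&url), Some(("12345", None)));
        // URLs on other hosts do not map back to an id.
        assert_eq!(HackerNews::id_from_comments_url("https://example.com/"), None);
    }

    #[test]
    fn tags_from_title_matches_known_markers() {
        let scraper = HackerNewsScraper::default();
        let config = HackerNewsConfig::default();
        assert_eq!(
            scraper.tags_from_title(&config, "Show HN: A thing I made"),
            vec!["show"]
        );
        assert_eq!(
            scraper.tags_from_title(&config, "Attention Is All You Need [pdf]"),
            vec!["pdf"]
        );
        assert!(scraper.tags_from_title(&config, "Plain title").is_empty());
    }
}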