progscrape_scrapers/backends/hacker_news.rs

use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{
    borrow::{Borrow, Cow},
    collections::HashMap,
};
use tl::{HTMLTag, Parser, ParserOptions};

use super::{
    scrape_story, utils::html::*, GenericScrape, ScrapeConfigSource, ScrapeCore, ScrapeShared,
    ScrapeSource, ScrapeSourceDef, ScrapeStory, Scraper,
};
use crate::types::*;

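/// Marker type for the Hacker News scrape source. The `ScrapeSourceDef` impl
/// below ties it to its config, story, and scraper types and maps story ids
/// to and from `https://news.ycombinator.com/item?id=<id>` comment URLs.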
pub struct HackerNews {}

impl ScrapeSourceDef for HackerNews {
    type Config = HackerNewsConfig;
    type Scrape = HackerNewsStory;
    type Scraper = HackerNewsScraper;

    fn comments_url(id: &str, _subsource: Option<&str>) -> String {
        format!("https://news.ycombinator.com/item?id={}", id)
    }

    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
        Some((
            url.strip_prefix("https://news.ycombinator.com/item?id=")?,
            None,
        ))
    }

    fn is_comments_host(host: &str) -> bool {
        host.ends_with("news.ycombinator.com")
    }
}

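/// Scraper configuration: the site homepage plus the page paths that are
/// appended to it to produce the URLs to fetch.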
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct HackerNewsConfig {
    homepage: String,
    pages: Vec<String>,
}

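// Hacker News has no subsources; each configured page is simply joined onto
// the homepage to build the fetch URLs.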
impl ScrapeConfigSource for HackerNewsConfig {
    fn subsources(&self) -> Vec<String> {
        vec![]
    }

    fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
        self.pages
            .iter()
            .map(|s| format!("{}{}", self.homepage, s))
            .collect_vec()
    }
}

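// Story payload scraped from a listing row: points, comment count, and the
// story's position on the page, on top of the shared fields generated by
// `scrape_story!`.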
scrape_story! {
    HackerNewsStory {
        points: u32,
        comments: u32,
        position: u32,
    }
}

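// When the same story is seen more than once, keep the highest points and
// comment counts observed.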
impl ScrapeStory for HackerNewsStory {
    const TYPE: ScrapeSource = ScrapeSource::HackerNews;

    fn merge(&mut self, other: HackerNewsStory) {
        self.points = std::cmp::max(self.points, other.points);
        self.comments = std::cmp::max(self.comments, other.comments);
    }
}

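/// Scrapes stories from Hacker News listing pages.
///
/// A minimal usage sketch (the `html` input is a hypothetical page body
/// fetched from one of the configured URLs):
///
/// ```ignore
/// let scraper = HackerNewsScraper::default();
/// let (stories, errors) = scraper.scrape(&HackerNewsConfig::default(), &html)?;
/// ```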
#[derive(Default)]
pub struct HackerNewsScraper {}

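/// The title row of a story listing (a `tr` containing `.titleline`): story
/// id, rank position, link URL, and title.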
#[derive(Debug)]
struct HackerNewsStoryLine {
    id: String,
    position: u32,
    url: StoryUrl,
    title: String,
}

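/// The subtext row that follows a title row (a `tr` containing `.subtext`):
/// story id, comment count, points, and post date.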
#[derive(Debug)]
struct HackerNewsInfoLine {
    id: String,
    comments: u32,
    points: u32,
    date: StoryDate,
}

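/// A parsed listing row: either a title row or its matching subtext row.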
#[derive(Debug)]
enum HackerNewsNode {
    StoryLine(HackerNewsStoryLine),
    InfoLine(HackerNewsInfoLine),
}

impl HackerNewsScraper {
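    /// Classifies a single `tr` as a title row or a subtext row and extracts
    /// its fields, or returns a description of why the row was skipped.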
    fn map_node_to_story(&self, p: &Parser, node: &HTMLTag) -> Result<HackerNewsNode, String> {
        if find_first(p, node, "table").is_some() {
            return Err("Story table cannot contain other tables".to_string());
        }

        fn extract_number(s: &str) -> Result<u32, String> {
            str::parse(&s.replace(|c: char| !c.is_ascii_digit(), ""))
                .map_err(|_| format!("Failed to parse number: '{}'", s))
        }

        return if let Some(titleline) = find_first(p, node, ".titleline") {
            if find_first(p, node, ".votelinks").is_none() {
                return Err("Missing votelinks".to_string());
            }
            let first_link = find_first(p, titleline, "a")
                .ok_or_else(|| "Failed to query first link".to_string())?;
            let title = unescape_entities(first_link.inner_text(p).borrow());
            let mut url = unescape_entities(
                &get_attribute(p, first_link, "href")
                    .ok_or_else(|| "Failed to get href".to_string())?,
            );
            if url.starts_with("item?") {
                url.insert_str(0, "https://news.ycombinator.com/");
            }
            let url = StoryUrl::parse(&url).ok_or(format!("Failed to parse URL {}", url))?;
            let id =
                get_attribute(p, node, "id").ok_or_else(|| "Failed to get id node".to_string())?;
            let rank =
                find_first(p, node, ".rank").ok_or_else(|| "Failed to get rank".to_string())?;
            let position = rank
                .inner_text(p)
                .trim_end_matches('.')
                .parse()
                .or(Err("Failed to parse rank".to_string()))?;
            Ok(HackerNewsNode::StoryLine(HackerNewsStoryLine {
                id,
                position,
                url,
                title,
            }))
        } else if find_first(p, node, ".subtext").is_some() {
            let age_node =
                find_first(p, node, ".age").ok_or_else(|| "Failed to query .age".to_string())?;
            let date = get_attribute(p, age_node, "title")
                .ok_or_else(|| "Failed to get age title".to_string())?
                + "Z";
            let date = StoryDate::parse_from_rfc3339(&date)
                .ok_or_else(|| "Failed to map date".to_string())?;
            let mut comments = None;
            for node in html_tag_iterator(p, node.query_selector(p, "a")) {
                let text = node.inner_text(p);
                if text.contains("comment") {
                    comments = Some(extract_number(text.borrow())?);
                } else if text.contains("discuss") {
                    comments = Some(0);
                }
            }
            let score_node = find_first(p, node, ".score")
                .ok_or_else(|| "Failed to query .score".to_string())?;
            let id = get_attribute(p, score_node, "id")
                .ok_or_else(|| "Missing ID on score node".to_string())?
                .trim_start_matches("score_")
                .into();
            let points = extract_number(score_node.inner_text(p).borrow())?;
            let comments = comments.ok_or_else(|| "Missing comment count".to_string())?;
            Ok(HackerNewsNode::InfoLine(HackerNewsInfoLine {
                id,
                comments,
                points,
                date,
            }))
        } else {
            Err("Unknown node type".to_string())
        };
    }

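    /// Derives tags from title conventions: "Show HN"/"Ask HN" prefixes and
    /// "[pdf]"/"[video]" suffixes.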
    fn tags_from_title(
        &self,
        _args: &<HackerNews as ScrapeSourceDef>::Config,
        title: &str,
    ) -> Vec<&'static str> {
        let mut tags = vec![];
        // TODO: Strip years [ie: (2005)] from end of title
        if title.starts_with("Show HN") {
            tags.push("show");
        }
        if title.starts_with("Ask HN") {
            tags.push("ask");
        }
        if title.ends_with("[pdf]") {
            tags.push("pdf");
        }
        if title.ends_with("[video]") {
            tags.push("video");
        }
        tags
    }
}

impl Scraper for HackerNewsScraper {
    type Config = <HackerNews as ScrapeSourceDef>::Config;
    type Output = <HackerNews as ScrapeSourceDef>::Scrape;

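    // Parses a full listing page: every `tr` is classified, title and subtext
    // rows are paired by story id, and the stories are returned sorted by
    // position along with any per-row errors.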
    fn scrape(
        &self,
        _args: &HackerNewsConfig,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
        let dom = tl::parse(input, ParserOptions::default())?;
        let p = dom.parser();
        let mut errors = vec![];
        let mut story_lines = HashMap::new();
        let mut info_lines = HashMap::new();
        for node in html_tag_iterator(p, dom.query_selector("tr")) {
            match self.map_node_to_story(p, node) {
                Ok(HackerNewsNode::InfoLine(x)) => {
                    info_lines.insert(x.id.clone(), x);
                }
                Ok(HackerNewsNode::StoryLine(x)) => {
                    story_lines.insert(x.id.clone(), x);
                }
                Err(e) => {
                    errors.push(e);
                }
            }
        }
        let mut stories = vec![];
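        // Pair each title row with its subtext row by story id; title rows
        // without a matching subtext row are reported as errors.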
        for (k, v) in story_lines {
            let info = info_lines.remove(&k);
            if let Some(info) = info {
                let HackerNewsStoryLine {
                    url,
                    title: raw_title,
                    position,
                    ..
                } = v;
                let HackerNewsInfoLine {
                    date,
                    points,
                    comments,
                    ..
                } = info;
                let id = k;
                stories.push(HackerNewsStory::new(
                    id, date, raw_title, url, points, comments, position,
                ));
            } else {
                errors.push(format!("Unmatched story/info for id {}", k));
            }
        }
        stories.sort_by_key(|x| x.data.position);
        Ok((stories, errors))
    }

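    // Projects a scraped story into the shared `ScrapeCore` form; rank is the
    // zero-based listing position.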
    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a> {
        let tags = self
            .tags_from_title(args, &input.shared.raw_title)
            .into_iter()
            .map(Cow::Borrowed)
            .collect();
        ScrapeCore {
            source: &input.shared.id,
            title: Cow::Borrowed(&input.shared.raw_title),
            url: &input.shared.url,
            date: input.shared.date,
            rank: (input.data.position as usize).checked_sub(1),
            tags,
        }
    }
}