progscrape_scrapers/backends/hacker_news.rs

use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{
    borrow::{Borrow, Cow},
    collections::HashMap,
};
use tl::{HTMLTag, Parser, ParserOptions};

use super::{
    GenericScrape, ScrapeConfigSource, ScrapeCore, ScrapeSource, ScrapeSourceDef, ScrapeStory,
    Scraper, scrape_story, utils::html::*,
};
use crate::types::*;

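/// Scrape source definition for Hacker News (news.ycombinator.com), tying
/// together its config, story payload, and scraper types.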
pub struct HackerNews {}

impl ScrapeSourceDef for HackerNews {
    type Config = HackerNewsConfig;
    type Scrape = HackerNewsStory;
    type Scraper = HackerNewsScraper;

    fn comments_url(id: &str, _subsource: Option<&str>) -> String {
        format!("https://news.ycombinator.com/item?id={id}")
    }

    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
        Some((
            url.strip_prefix("https://news.ycombinator.com/item?id=")?,
            None,
        ))
    }

    fn is_comments_host(host: &str) -> bool {
        host.ends_with("news.ycombinator.com")
    }
}

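/// Configuration for the Hacker News scraper: a homepage URL plus a list of
/// page paths that are appended to it to produce the URLs to scrape.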
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct HackerNewsConfig {
    homepage: String,
    pages: Vec<String>,
}

impl ScrapeConfigSource for HackerNewsConfig {
    fn subsources(&self) -> Vec<String> {
        vec![]
    }

    fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
        self.pages
            .iter()
            .map(|s| format!("{}{}", self.homepage, s))
            .collect_vec()
    }
}

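// The scrape_story! macro (from super) expands to the HackerNewsStory payload
// type, combining these source-specific fields with the shared scrape fields
// (id, date, raw title, URL) used below.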
scrape_story! {
    HackerNewsStory {
        points: u32,
        comments: u32,
        position: u32,
    }
}

impl ScrapeStory for HackerNewsStory {
    const TYPE: ScrapeSource = ScrapeSource::HackerNews;

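    /// Keep the highest points and comment count seen across repeated scrapes
    /// of the same story.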
    fn merge(&mut self, other: HackerNewsStory) {
        self.points = std::cmp::max(self.points, other.points);
        self.comments = std::cmp::max(self.comments, other.comments);
    }
}

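/// Stateless scraper for Hacker News front-page HTML.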
#[derive(Default)]
pub struct HackerNewsScraper {}

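/// Intermediate result of parsing a story's title row: story id, rank
/// position, link URL, and title.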
#[derive(Debug)]
struct HackerNewsStoryLine {
    id: String,
    position: u32,
    url: StoryUrl,
    title: String,
}

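/// Intermediate result of parsing a story's subtext row: story id, comment
/// count, points, and submission date.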
#[derive(Debug)]
struct HackerNewsInfoLine {
    id: String,
    comments: u32,
    points: u32,
    date: StoryDate,
}

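/// Each parsed <tr> on the front page is either a story's title row or the
/// subtext row that follows it; the two are later paired by story id.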
#[derive(Debug)]
enum HackerNewsNode {
    StoryLine(HackerNewsStoryLine),
    InfoLine(HackerNewsInfoLine),
}

impl HackerNewsScraper {
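    /// Classify a <tr> as either a title row or a subtext row and extract the
    /// relevant fields, returning a descriptive error for anything unrecognized.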
    fn map_node_to_story(&self, p: &Parser, node: &HTMLTag) -> Result<HackerNewsNode, String> {
        if find_first(p, node, "table").is_some() {
            return Err("Story table cannot contain other tables".to_string());
        }

        fn extract_number(s: &str) -> Result<u32, String> {
            str::parse(&s.replace(|c: char| !c.is_ascii_digit(), ""))
                .map_err(|_| format!("Failed to parse number: '{s}'"))
        }

        if let Some(titleline) = find_first(p, node, ".titleline") {
            if find_first(p, node, ".votelinks").is_none() {
                return Err("Missing votelinks".to_string());
            }
            let first_link = find_first(p, titleline, "a")
                .ok_or_else(|| "Failed to query first link".to_string())?;
            let title = unescape_entities(first_link.inner_text(p).borrow());
            let mut url = unescape_entities(
                &get_attribute(p, first_link, "href")
                    .ok_or_else(|| "Failed to get href".to_string())?,
            );
            if url.starts_with("item?") {
                url.insert_str(0, "https://news.ycombinator.com/");
            }
            let url = StoryUrl::parse(&url).ok_or(format!("Failed to parse URL {url}"))?;
            let id =
                get_attribute(p, node, "id").ok_or_else(|| "Failed to get id node".to_string())?;
            let rank =
                find_first(p, node, ".rank").ok_or_else(|| "Failed to get rank".to_string())?;
            let position = rank
                .inner_text(p)
                .trim_end_matches('.')
                .parse()
                .or(Err("Failed to parse rank".to_string()))?;
            Ok(HackerNewsNode::StoryLine(HackerNewsStoryLine {
                id,
                position,
                url,
                title,
            }))
        } else if find_first(p, node, ".subtext").is_some() {
            let age_node =
                find_first(p, node, ".age").ok_or_else(|| "Failed to query .age".to_string())?;
            let mut date = get_attribute(p, age_node, "title")
                .ok_or_else(|| "Failed to get age title".to_string())?;
            if !date.ends_with("Z") {
                date += "Z";
            }
            let date = StoryDate::parse_from_rfc3339_loose(&date)
                .ok_or_else(|| format!("Failed to map date: {date}"))?;
            let mut comments = None;
            for node in html_tag_iterator(p, node.query_selector(p, "a")) {
                let text = node.inner_text(p);
                if text.contains("comment") {
                    comments = Some(extract_number(text.borrow())?);
                } else if text.contains("discuss") {
                    comments = Some(0);
                }
            }
            let score_node = find_first(p, node, ".score")
                .ok_or_else(|| "Failed to query .score".to_string())?;
            let id = get_attribute(p, score_node, "id")
                .ok_or_else(|| "Missing ID on score node".to_string())?
                .trim_start_matches("score_")
                .into();
            let points = extract_number(score_node.inner_text(p).borrow())?;
            let comments = comments.ok_or_else(|| "Missing comment count".to_string())?;
            Ok(HackerNewsNode::InfoLine(HackerNewsInfoLine {
                id,
                comments,
                points,
                date,
            }))
        } else {
            Err("Unknown node type".to_string())
        }
    }

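    /// Derive tags from conventional title markers: "Show HN", "Ask HN",
    /// "[pdf]", and "[video]".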
    fn tags_from_title(
        &self,
        _args: &<HackerNews as ScrapeSourceDef>::Config,
        title: &str,
    ) -> Vec<&'static str> {
        let mut tags = vec![];
        if title.starts_with("Show HN") {
            tags.push("show");
        }
        if title.starts_with("Ask HN") {
            tags.push("ask");
        }
        if title.ends_with("[pdf]") {
            tags.push("pdf");
        }
        if title.ends_with("[video]") {
            tags.push("video");
        }
        tags
    }
}

impl Scraper for HackerNewsScraper {
    type Config = <HackerNews as ScrapeSourceDef>::Config;
    type Output = <HackerNews as ScrapeSourceDef>::Scrape;

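    /// Parse a full front-page HTML document, pairing each title row with its
    /// subtext row by story id. Rows that fail to parse or pair are reported
    /// as errors alongside the successfully scraped stories.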
    fn scrape(
        &self,
        _args: &HackerNewsConfig,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
        let dom = tl::parse(input, ParserOptions::default())?;
        let p = dom.parser();
        let mut errors = vec![];
        let mut story_lines = HashMap::new();
        let mut info_lines = HashMap::new();
        for node in html_tag_iterator(p, dom.query_selector("tr")) {
            match self.map_node_to_story(p, node) {
                Ok(HackerNewsNode::InfoLine(x)) => {
                    info_lines.insert(x.id.clone(), x);
                }
                Ok(HackerNewsNode::StoryLine(x)) => {
                    story_lines.insert(x.id.clone(), x);
                }
                Err(e) => {
                    errors.push(e);
                }
            }
        }
        let mut stories = vec![];
        for (k, v) in story_lines {
            let info = info_lines.remove(&k);
            if let Some(info) = info {
                let HackerNewsStoryLine {
                    url,
                    title: raw_title,
                    position,
                    ..
                } = v;
                let HackerNewsInfoLine {
                    date,
                    points,
                    comments,
                    ..
                } = info;
                let id = k;
                stories.push(HackerNewsStory::new(
                    id, date, raw_title, url, points, comments, position,
                ));
            } else {
                errors.push(format!("Unmatched story/info for id {k}"));
            }
        }
        stories.sort_by_key(|x| x.data.position);
        Ok((stories, errors))
    }

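    /// Project a stored scrape into the shared ScrapeCore view, deriving tags
    /// from the title and converting the 1-based front-page position into a
    /// 0-based rank.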
    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a> {
        let tags = self
            .tags_from_title(args, &input.shared.raw_title)
            .into_iter()
            .map(Cow::Borrowed)
            .collect();
        ScrapeCore {
            source: &input.shared.id,
            title: Cow::Borrowed(&input.shared.raw_title),
            url: &input.shared.url,
            date: input.shared.date,
            rank: (input.data.position as usize).checked_sub(1),
            tags,
        }
    }
}
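
// A minimal usage sketch of the scrape path above. The fixture is hand-written
// to exercise just the selectors map_node_to_story looks for (.rank,
// .votelinks, .titleline, .subtext, .score, .age); it is not captured from
// news.ycombinator.com, and real front-page markup is considerably larger.
#[cfg(test)]
mod tests {
    use super::*;

    const FIXTURE: &str = r#"
        <table>
            <tr class="athing" id="1000">
                <td class="rank">1.</td>
                <td class="votelinks"></td>
                <td class="titleline"><a href="https://example.com/">Example story</a></td>
            </tr>
            <tr>
                <td class="subtext">
                    <span class="score" id="score_1000">42 points</span>
                    <span class="age" title="2024-01-01T00:00:00">1 hour ago</span>
                    <a href="item?id=1000">10 comments</a>
                </td>
            </tr>
        </table>
    "#;

    #[test]
    fn scrapes_minimal_fixture() {
        let scraper = HackerNewsScraper::default();
        let (stories, errors) = scraper
            .scrape(&HackerNewsConfig::default(), FIXTURE)
            .expect("fixture should parse");
        // Both rows should classify cleanly and pair up by id "1000".
        assert!(errors.is_empty(), "{errors:?}");
        assert_eq!(stories.len(), 1);
        assert_eq!(stories[0].data.points, 42);
        assert_eq!(stories[0].data.comments, 10);
        assert_eq!(stories[0].data.position, 1);
    }
}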