progscrape_scrapers/backends/hacker_news.rs

use itertools::Itertools;
use serde::{Deserialize, Serialize};
use std::{
    borrow::{Borrow, Cow},
    collections::HashMap,
};
use tl::{HTMLTag, Parser, ParserOptions};

use super::{
    scrape_story, utils::html::*, GenericScrape, ScrapeConfigSource, ScrapeCore, ScrapeShared,
    ScrapeSource, ScrapeSourceDef, ScrapeStory, Scraper,
};
use crate::types::*;

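/// Scrape source definition for Hacker News (news.ycombinator.com).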
pub struct HackerNews {}

impl ScrapeSourceDef for HackerNews {
    type Config = HackerNewsConfig;
    type Scrape = HackerNewsStory;
    type Scraper = HackerNewsScraper;

    fn comments_url(id: &str, _subsource: Option<&str>) -> String {
        format!("https://news.ycombinator.com/item?id={}", id)
    }

    fn id_from_comments_url(url: &str) -> Option<(&str, Option<&str>)> {
        Some((
            url.strip_prefix("https://news.ycombinator.com/item?id=")?,
            None,
        ))
    }

    fn is_comments_host(host: &str) -> bool {
        host.ends_with("news.ycombinator.com")
    }
}

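/// Configuration for the Hacker News scraper: the homepage URL plus the page
/// paths appended to it to produce the URLs to fetch.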
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct HackerNewsConfig {
    homepage: String,
    pages: Vec<String>,
}

impl ScrapeConfigSource for HackerNewsConfig {
    fn subsources(&self) -> Vec<String> {
        vec![]
    }

    fn provide_urls(&self, _: Vec<String>) -> Vec<String> {
        self.pages
            .iter()
            .map(|s| format!("{}{}", self.homepage, s))
            .collect_vec()
    }
}

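// `scrape_story!` declares the source-specific story payload; these fields
// are carried alongside the shared scrape fields (id, date, title, url).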
scrape_story! {
    HackerNewsStory {
        points: u32,
        comments: u32,
        position: u32,
    }
}

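// When the same story appears in multiple scrapes, `merge` keeps the highest
// points and comment counts seen so far.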
impl ScrapeStory for HackerNewsStory {
    const TYPE: ScrapeSource = ScrapeSource::HackerNews;

    fn merge(&mut self, other: HackerNewsStory) {
        self.points = std::cmp::max(self.points, other.points);
        self.comments = std::cmp::max(self.comments, other.comments);
    }
}

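/// Stateless scraper for Hacker News listing pages.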
#[derive(Default)]
pub struct HackerNewsScraper {}

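// A story on a Hacker News listing page spans two adjacent table rows: a
// title row (rank, link, title) and a "subtext" row (score, age, comment
// count). These intermediate structs hold one parsed row each until the two
// halves are joined by story id.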
#[derive(Debug)]
struct HackerNewsStoryLine {
    id: String,
    position: u32,
    url: StoryUrl,
    title: String,
}

#[derive(Debug)]
struct HackerNewsInfoLine {
    id: String,
    comments: u32,
    points: u32,
    date: StoryDate,
}

#[derive(Debug)]
enum HackerNewsNode {
    StoryLine(HackerNewsStoryLine),
    InfoLine(HackerNewsInfoLine),
}

impl HackerNewsScraper {
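    /// Classifies a `<tr>` from the listing table as either a story title row
    /// or a subtext row, extracting the fields relevant to each.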
    fn map_node_to_story(&self, p: &Parser, node: &HTMLTag) -> Result<HackerNewsNode, String> {
        if find_first(p, node, "table").is_some() {
            return Err("Story table cannot contain other tables".to_string());
        }

        fn extract_number(s: &str) -> Result<u32, String> {
            str::parse(&s.replace(|c: char| !c.is_ascii_digit(), ""))
                .map_err(|_| format!("Failed to parse number: '{}'", s))
        }

        return if let Some(titleline) = find_first(p, node, ".titleline") {
            if find_first(p, node, ".votelinks").is_none() {
                return Err("Missing votelinks".to_string());
            }
            let first_link = find_first(p, titleline, "a")
                .ok_or_else(|| "Failed to query first link".to_string())?;
            let title = unescape_entities(first_link.inner_text(p).borrow());
            let mut url = unescape_entities(
                &get_attribute(p, first_link, "href")
                    .ok_or_else(|| "Failed to get href".to_string())?,
            );
            // Self posts (Ask HN, etc.) link to relative "item?id=..." URLs.
            if url.starts_with("item?") {
                url.insert_str(0, "https://news.ycombinator.com/");
            }
            let url = StoryUrl::parse(&url).ok_or(format!("Failed to parse URL {}", url))?;
            let id =
                get_attribute(p, node, "id").ok_or_else(|| "Failed to get id node".to_string())?;
            let rank =
                find_first(p, node, ".rank").ok_or_else(|| "Failed to get rank".to_string())?;
            // The rank renders as "1.", "2.", ... so trim the trailing dot.
            let position = rank
                .inner_text(p)
                .trim_end_matches('.')
                .parse()
                .or(Err("Failed to parse rank".to_string()))?;
            Ok(HackerNewsNode::StoryLine(HackerNewsStoryLine {
                id,
                position,
                url,
                title,
            }))
        } else if find_first(p, node, ".subtext").is_some() {
            let age_node =
                find_first(p, node, ".age").ok_or_else(|| "Failed to query .age".to_string())?;
            // The age title attribute carries no timezone suffix; treat it as UTC.
            let date = get_attribute(p, age_node, "title")
                .ok_or_else(|| "Failed to get age title".to_string())?
                + "Z";
            let date = StoryDate::parse_from_rfc3339(&date)
                .ok_or_else(|| "Failed to map date".to_string())?;
            let mut comments = None;
            for node in html_tag_iterator(p, node.query_selector(p, "a")) {
                let text = node.inner_text(p);
                if text.contains("comment") {
                    comments = Some(extract_number(text.borrow())?);
                } else if text.contains("discuss") {
                    // "discuss" is shown in place of a count when there are
                    // no comments yet.
                    comments = Some(0);
                }
            }
            let score_node = find_first(p, node, ".score")
                .ok_or_else(|| "Failed to query .score".to_string())?;
            let id = get_attribute(p, score_node, "id")
                .ok_or_else(|| "Missing ID on score node".to_string())?
                .trim_start_matches("score_")
                .into();
            let points = extract_number(score_node.inner_text(p).borrow())?;
            let comments = comments.ok_or_else(|| "Missing comment count".to_string())?;
            Ok(HackerNewsNode::InfoLine(HackerNewsInfoLine {
                id,
                comments,
                points,
                date,
            }))
        } else {
            Err("Unknown node type".to_string())
        };
    }

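    /// Derives tags from Hacker News title conventions: "Show HN", "Ask HN",
    /// and trailing "[pdf]" / "[video]" markers.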
    fn tags_from_title(
        &self,
        _args: &<HackerNews as ScrapeSourceDef>::Config,
        title: &str,
    ) -> Vec<&'static str> {
        let mut tags = vec![];
        if title.starts_with("Show HN") {
            tags.push("show");
        }
        if title.starts_with("Ask HN") {
            tags.push("ask");
        }
        if title.ends_with("[pdf]") {
            tags.push("pdf");
        }
        if title.ends_with("[video]") {
            tags.push("video");
        }
        tags
    }
}

impl Scraper for HackerNewsScraper {
    type Config = <HackerNews as ScrapeSourceDef>::Config;
    type Output = <HackerNews as ScrapeSourceDef>::Scrape;

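    /// Parses a listing page into stories. Title and subtext rows are
    /// collected independently, keyed by story id, then joined; any row or
    /// story that cannot be parsed is reported in the error list instead of
    /// failing the whole scrape.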
    fn scrape(
        &self,
        _args: &HackerNewsConfig,
        input: &str,
    ) -> Result<(Vec<GenericScrape<Self::Output>>, Vec<String>), ScrapeError> {
        let dom = tl::parse(input, ParserOptions::default())?;
        let p = dom.parser();
        let mut errors = vec![];
        let mut story_lines = HashMap::new();
        let mut info_lines = HashMap::new();
        // First pass: classify every table row as a title row or subtext row.
        for node in html_tag_iterator(p, dom.query_selector("tr")) {
            match self.map_node_to_story(p, node) {
                Ok(HackerNewsNode::InfoLine(x)) => {
                    info_lines.insert(x.id.clone(), x);
                }
                Ok(HackerNewsNode::StoryLine(x)) => {
                    story_lines.insert(x.id.clone(), x);
                }
                Err(e) => {
                    errors.push(e);
                }
            }
        }
        // Second pass: join the two halves of each story by id.
        let mut stories = vec![];
        for (k, v) in story_lines {
            let info = info_lines.remove(&k);
            if let Some(info) = info {
                let HackerNewsStoryLine {
                    url,
                    title: raw_title,
                    position,
                    ..
                } = v;
                let HackerNewsInfoLine {
                    date,
                    points,
                    comments,
                    ..
                } = info;
                let id = k;
                stories.push(HackerNewsStory::new(
                    id, date, raw_title, url, points, comments, position,
                ));
            } else {
                errors.push(format!("Unmatched story/info for id {}", k));
            }
        }
        // HashMap iteration order is arbitrary; restore on-page order.
        stories.sort_by_key(|x| x.data.position);
        Ok((stories, errors))
    }

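    /// Maps the source-specific scrape into the shared `ScrapeCore`
    /// representation, deriving tags from the title.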
    fn extract_core<'a>(
        &self,
        args: &Self::Config,
        input: &'a GenericScrape<Self::Output>,
    ) -> ScrapeCore<'a> {
        let tags = self
            .tags_from_title(args, &input.shared.raw_title)
            .into_iter()
            .map(Cow::Borrowed)
            .collect();
        ScrapeCore {
            source: &input.shared.id,
            title: Cow::Borrowed(&input.shared.raw_title),
            url: &input.shared.url,
            date: input.shared.date,
            // `position` is 1-based on the page; the zero-based rank is
            // `position - 1`, with a position of 0 mapping to `None`.
            rank: (input.data.position as usize).checked_sub(1),
            tags,
        }
    }
}