Skip to main content

fetchkit/fetchers/
hackernews.rs

1//! Hacker News thread fetcher
2//!
3//! Handles news.ycombinator.com/item?id={id} URLs, returning structured
4//! thread content via the HN Firebase API.
5
6use crate::client::FetchOptions;
7use crate::error::FetchError;
8use crate::fetchers::Fetcher;
9use crate::types::{FetchRequest, FetchResponse};
10use crate::DEFAULT_USER_AGENT;
11use async_trait::async_trait;
12use reqwest::header::{HeaderValue, USER_AGENT};
13use serde::Deserialize;
14use std::time::Duration;
15use url::Url;
16
/// Timeout for HN API requests; used as both the connect timeout and the
/// overall per-request timeout of the HTTP client built in `fetch`.
const API_TIMEOUT: Duration = Duration::from_secs(10);

/// Max top-level comments to fetch
const MAX_COMMENTS: usize = 20;
21
/// Hacker News fetcher
///
/// Matches `news.ycombinator.com/item?id={id}`, returning structured
/// thread content via the HN Firebase API.
///
/// This is a stateless unit struct, so it derives `Debug`, `Clone` and
/// `Copy`: it can be logged and passed around by value at no cost.
#[derive(Debug, Clone, Copy)]
pub struct HackerNewsFetcher;
27
28impl HackerNewsFetcher {
29    pub fn new() -> Self {
30        Self
31    }
32
33    fn parse_url(url: &Url) -> Option<u64> {
34        let host = url.host_str()?;
35        if host != "news.ycombinator.com" {
36            return None;
37        }
38
39        let segments: Vec<&str> = url.path_segments().map(|s| s.collect()).unwrap_or_default();
40        if segments.first() != Some(&"item") {
41            return None;
42        }
43
44        url.query_pairs()
45            .find(|(k, _)| k == "id")
46            .and_then(|(_, v)| v.parse().ok())
47    }
48}
49
50impl Default for HackerNewsFetcher {
51    fn default() -> Self {
52        Self::new()
53    }
54}
55
/// A single item as deserialized from the HN API item endpoint.
///
/// The same shape is used for the thread root and for its comments and
/// replies. Every field except `id` is optional.
#[derive(Debug, Deserialize)]
struct HNItem {
    /// Numeric item id; also used to build the item's HN URL in the output.
    id: u64,
    /// Item kind, read from the JSON field `type`; the formatter falls back
    /// to "story" when it is absent.
    #[serde(rename = "type")]
    item_type: Option<String>,
    /// Title, rendered as the top-level heading.
    title: Option<String>,
    /// Body text; passed through `strip_html_tags` before output, so it is
    /// treated as HTML.
    text: Option<String>,
    /// Link shown as "Link" in the metadata list.
    url: Option<String>,
    /// Author name; comments without one are shown as "anonymous".
    by: Option<String>,
    /// Score shown in the metadata list.
    score: Option<i64>,
    /// Count shown as "Comments" and used as the total in the comments header.
    descendants: Option<u64>,
    /// Ids of direct child items, each fetched individually as a comment or reply.
    kids: Option<Vec<u64>>,
}
69
70#[async_trait]
71impl Fetcher for HackerNewsFetcher {
72    fn name(&self) -> &'static str {
73        "hackernews"
74    }
75
76    fn matches(&self, url: &Url) -> bool {
77        Self::parse_url(url).is_some()
78    }
79
80    async fn fetch(
81        &self,
82        request: &FetchRequest,
83        options: &FetchOptions,
84    ) -> Result<FetchResponse, FetchError> {
85        let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;
86
87        let item_id = Self::parse_url(&url)
88            .ok_or_else(|| FetchError::FetcherError("Not a valid HN URL".to_string()))?;
89
90        let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
91        let mut client_builder = reqwest::Client::builder()
92            .connect_timeout(API_TIMEOUT)
93            .timeout(API_TIMEOUT)
94            .redirect(reqwest::redirect::Policy::limited(3));
95
96        if !options.respect_proxy_env {
97            client_builder = client_builder.no_proxy();
98        }
99
100        let client = client_builder
101            .build()
102            .map_err(FetchError::ClientBuildError)?;
103
104        let ua_header = HeaderValue::from_str(user_agent)
105            .unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));
106
107        // Fetch the item
108        let item = fetch_item(&client, &ua_header, item_id).await?;
109
110        // Fetch top-level comments
111        let comments = if let Some(kids) = &item.kids {
112            let mut comments = Vec::new();
113            for &kid_id in kids.iter().take(MAX_COMMENTS) {
114                if let Ok(comment) = fetch_item(&client, &ua_header, kid_id).await {
115                    // Fetch one level of replies
116                    let replies = if let Some(reply_ids) = &comment.kids {
117                        let mut replies = Vec::new();
118                        for &reply_id in reply_ids.iter().take(5) {
119                            if let Ok(reply) = fetch_item(&client, &ua_header, reply_id).await {
120                                replies.push(reply);
121                            }
122                        }
123                        replies
124                    } else {
125                        Vec::new()
126                    };
127                    comments.push((comment, replies));
128                }
129            }
130            comments
131        } else {
132            Vec::new()
133        };
134
135        let content = format_hn_response(&item, &comments);
136
137        Ok(FetchResponse {
138            url: request.url.clone(),
139            status_code: 200,
140            content_type: Some("text/markdown".to_string()),
141            format: Some("hackernews".to_string()),
142            content: Some(content),
143            ..Default::default()
144        })
145    }
146}
147
148async fn fetch_item(
149    client: &reqwest::Client,
150    ua: &HeaderValue,
151    id: u64,
152) -> Result<HNItem, FetchError> {
153    let url = format!("https://hacker-news.firebaseio.com/v0/item/{}.json", id);
154
155    let resp = client
156        .get(&url)
157        .header(USER_AGENT, ua.clone())
158        .send()
159        .await
160        .map_err(FetchError::from_reqwest)?;
161
162    if !resp.status().is_success() {
163        return Err(FetchError::FetcherError(format!(
164            "HN API error: HTTP {}",
165            resp.status()
166        )));
167    }
168
169    resp.json()
170        .await
171        .map_err(|e| FetchError::FetcherError(format!("Failed to parse HN item: {}", e)))
172}
173
174fn format_hn_response(item: &HNItem, comments: &[(HNItem, Vec<HNItem>)]) -> String {
175    let mut out = String::new();
176
177    let item_type = item.item_type.as_deref().unwrap_or("story");
178
179    // Title
180    let title = item.title.as_deref().unwrap_or("Hacker News Item");
181    out.push_str(&format!("# {}\n\n", title));
182
183    // Metadata
184    out.push_str("## Info\n\n");
185    out.push_str(&format!("- **Type:** {}\n", item_type));
186
187    if let Some(by) = &item.by {
188        out.push_str(&format!("- **By:** {}\n", by));
189    }
190    if let Some(score) = item.score {
191        out.push_str(&format!("- **Score:** {}\n", score));
192    }
193    if let Some(descendants) = item.descendants {
194        out.push_str(&format!("- **Comments:** {}\n", descendants));
195    }
196    if let Some(url) = &item.url {
197        out.push_str(&format!("- **Link:** {}\n", url));
198    }
199    out.push_str(&format!(
200        "- **HN URL:** https://news.ycombinator.com/item?id={}\n",
201        item.id
202    ));
203
204    // Story text (for Ask HN, Show HN, etc.)
205    if let Some(text) = &item.text {
206        let cleaned = strip_html_tags(text);
207        out.push_str(&format!("\n{}\n", cleaned));
208    }
209
210    // Comments
211    if !comments.is_empty() {
212        let total = item.descendants.unwrap_or(0);
213        let shown: usize = comments.len() + comments.iter().map(|(_, r)| r.len()).sum::<usize>();
214        if shown < total as usize {
215            out.push_str(&format!("\n---\n\n## Comments ({} of {})\n", shown, total));
216        } else {
217            out.push_str(&format!("\n---\n\n## Comments ({})\n", shown));
218        }
219
220        for (comment, replies) in comments {
221            format_comment(&mut out, comment, 0);
222            for reply in replies {
223                format_comment(&mut out, reply, 1);
224            }
225        }
226    }
227
228    out
229}
230
231fn format_comment(out: &mut String, comment: &HNItem, depth: usize) {
232    let indent = "> ".repeat(depth);
233    let by = comment.by.as_deref().unwrap_or("anonymous");
234
235    out.push_str(&format!("\n{}**{}**\n\n", indent, by));
236
237    if let Some(text) = &comment.text {
238        let cleaned = strip_html_tags(text);
239        for line in cleaned.lines() {
240            out.push_str(&format!("{}{}\n", indent, line));
241        }
242        out.push('\n');
243    }
244}
245
/// Simple HTML tag stripper for HN comment text.
///
/// Removes everything between `<` and `>` and decodes a small set of common
/// HTML entities. Opening `<p>` and `<br>` tags (any case, with or without
/// attributes or a trailing slash) are replaced by a newline so paragraph
/// structure survives; all other tags are dropped.
///
/// Never panics: the tag name is read from the byte offset of the `<` in
/// the input, which is always a valid char boundary.
fn strip_html_tags(html: &str) -> String {
    let mut text = String::with_capacity(html.len());
    let mut in_tag = false;

    for (pos, c) in html.char_indices() {
        match c {
            '<' => {
                in_tag = true;
                // '<' is a single byte, so `pos + 1` is a char boundary.
                let rest = &html[pos + 1..];
                let name_end = rest
                    .find(|ch: char| !ch.is_ascii_alphanumeric())
                    .unwrap_or(rest.len());
                let tag_name = &rest[..name_end];
                // Closing tags start with '/', giving an empty name here,
                // so only opening <p>/<br> tags produce a line break.
                if tag_name.eq_ignore_ascii_case("p") || tag_name.eq_ignore_ascii_case("br") {
                    text.push('\n');
                }
            }
            '>' => in_tag = false,
            _ if !in_tag => text.push(c),
            _ => {}
        }
    }

    // Decode common HTML entities. `&amp;` goes last so an escaped entity
    // such as `&amp;lt;` decodes once to `&lt;` rather than twice to `<`.
    text.replace("&lt;", "<")
        .replace("&gt;", ">")
        .replace("&quot;", "\"")
        .replace("&#x27;", "'")
        .replace("&#39;", "'")
        .replace("&#x2F;", "/")
        .replace("&amp;", "&")
}
280
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `raw` as a URL and runs it through the fetcher's URL parser.
    fn item_id_of(raw: &str) -> Option<u64> {
        let parsed = Url::parse(raw).unwrap();
        HackerNewsFetcher::parse_url(&parsed)
    }

    #[test]
    fn test_parse_hn_url() {
        assert_eq!(
            item_id_of("https://news.ycombinator.com/item?id=12345"),
            Some(12345)
        );
    }

    #[test]
    fn test_rejects_non_hn() {
        assert_eq!(item_id_of("https://example.com/item?id=123"), None);
    }

    #[test]
    fn test_rejects_non_item_path() {
        assert_eq!(item_id_of("https://news.ycombinator.com/newest"), None);
    }

    #[test]
    fn test_rejects_no_id() {
        assert_eq!(item_id_of("https://news.ycombinator.com/item"), None);
    }

    #[test]
    fn test_fetcher_matches() {
        let fetcher = HackerNewsFetcher::new();
        let hn_url = Url::parse("https://news.ycombinator.com/item?id=123").unwrap();
        let other_url = Url::parse("https://example.com/item?id=123").unwrap();

        assert!(fetcher.matches(&hn_url));
        assert!(!fetcher.matches(&other_url));
    }

    #[test]
    fn test_strip_html_tags() {
        let cases = [
            ("Hello <b>world</b>", "Hello world"),
            ("a &amp; b", "a & b"),
        ];
        for &(input, expected) in cases.iter() {
            assert_eq!(strip_html_tags(input), expected);
        }
    }

    #[test]
    fn test_format_hn_response() {
        let story = HNItem {
            id: 42,
            item_type: Some("story".to_string()),
            title: Some("Show HN: My Project".to_string()),
            text: None,
            url: Some("https://example.com".to_string()),
            by: Some("pg".to_string()),
            score: Some(100),
            descendants: Some(5),
            kids: None,
        };

        let rendered = format_hn_response(&story, &[]);

        let expected_fragments = [
            "# Show HN: My Project",
            "**By:** pg",
            "**Score:** 100",
            "https://example.com",
        ];
        for fragment in expected_fragments.iter() {
            assert!(rendered.contains(*fragment));
        }
    }
}