hacker_news/client/
html_client.rs

1use std::error::Error;
2use log;
3use std::collections::HashMap;
4use std::cell::RefCell;
5use lazy_static::lazy_static;
6use regex::Regex;
7use reqwest;
8use reqwest::blocking::ClientBuilder;
9use reqwest::header::HeaderValue;
10use reqwest::header::HeaderMap;
11use reqwest::cookie::Cookie;
12use reqwest::redirect::Policy;
13use scraper;
14use scraper::Html;
15use scraper::Selector;
16use scraper::ElementRef;
17use crate::error::HttpError;
18use crate::error::HnError;
19use crate::parser::HtmlParse;
20use crate::parser::ListingsParser;
21use crate::parser::CommentsParser;
22use crate::parser::extract_fnid;
23use crate::parser::comments::create_comment_tree;
24use crate::model::Id;
25use crate::model::Listing;
26use crate::model::Date;
27use crate::model::Thread;
28
29
/// Endpoint that `Client::login` POSTs the credentials form to.
const URL_LOGIN: &str = "https://news.ycombinator.com/login";
/// Page that `Client::get_fnid` GETs in order to read the `fnid` form input.
const URL_SUBMIT_FORM: &str = "https://news.ycombinator.com/submit";
/// Endpoint that `Client::submit` POSTs the submission form to.
const URL_SUBMIT: &str = "https://news.ycombinator.com/r";
33
lazy_static! {
    // Lazily compiled pattern that captures the contents of a `value="..."`
    // attribute appearing after `<input` on the same line (`.` does not match
    // newlines by default). The `.unwrap()` only fires if the literal pattern
    // itself is invalid.
    // NOTE(review): nothing in the visible part of this file references this
    // static — fnid extraction goes through `extract_fnid` — confirm whether
    // it is still needed.
    static ref FNID_REGEX: Regex =  Regex::new(r#"<input.*value="(.+?)".*>"#).unwrap();
}
37
38pub struct Client {
39    http_client: reqwest::blocking::Client,
40    cookie: RefCell<Option<(String, String)>>,
41}
42
43impl Client {
44
45    pub fn new() -> Self {
46        Self {
47            http_client: reqwest::blocking::Client::new(),
48            cookie: RefCell::new(None),
49        }
50    }
51
52    fn cookie(&self) -> Result<String, Box<dyn Error>> {
53        // Note: Chaining these causes a compiler error about dropping to early
54        let pair = self.cookie.borrow();
55        let pair = pair.as_ref().ok_or(HnError::UnauthenticatedError)?;
56
57        Ok(format!("{}={};", pair.0, pair.1))
58    }
59
60    pub fn submit(
61        &self,
62        title: String,
63        url: Option<String>,
64        text: Option<String>,
65    ) -> Result<(), Box<dyn Error>> {
66
67        let cookie_string = self.cookie()?;
68        let cookie: HeaderValue = cookie_string.parse()
69            .expect("Got a user cookie, but failed to parse it to a header");
70
71        let mut formdata = HashMap::new();
72        formdata.insert("fnid", self.get_fnid()?);
73        formdata.insert("fnop", "submit-page".to_string());
74        formdata.insert("url", url.unwrap_or_else(|| "".to_string()));
75        formdata.insert("text", text.unwrap_or_else(|| "".to_string()));
76        log::debug!("submit post body = {:?}", formdata);
77        formdata.insert("title", title);
78        
79        let req = self.http_client.post(URL_SUBMIT)
80            .header("Cookie", cookie)
81            .form(&formdata);
82        log::debug!("submit post request = {:?}", req);
83        let resp = req.send()?;
84        log::debug!("submit post response = {:?}", resp);
85        
86        Ok(())
87
88    }
89    
90    fn get_fnid(&self) -> Result<String, Box<dyn Error>> {
91        let cookie_string = self.cookie()?;
92        let cookie: HeaderValue = cookie_string.parse()
93            .expect("Got a user cookie, but failed to parse it to a header");
94    
95        let req = self.http_client
96            .get(URL_SUBMIT_FORM)
97            .header("Cookie", cookie);
98        log::debug!("submit form request = {:?}", req);
99        let resp = req.send()?;
100        log::debug!("submit form response = {:?}", resp);
101        let body = resp.text()?;
102        let dom = Html::parse_document(&body);
103        
104        // Underlying library doesn't implement std::error::Error on their
105        // Error structs, so I can't include it as the src error in my struct
106        let selector = match Selector::parse("input[name='fnid']") {
107            Err(_src) => {
108                return Err(Box::new(HnError::HtmlParsingError));
109            },
110            Ok(selector) => selector,
111        };
112    
113        let result: Vec<ElementRef> = dom.select(&selector).collect();
114        let el = match result.get(0) {
115            Some(el) => el,
116            None => {
117                return Err(Box::new(HnError::HtmlParsingError));
118            }
119        };
120        let fnid = extract_fnid(el)?;
121    
122        Ok(fnid)
123    }
124
125    pub fn login(&self, username: &str, password: &str) -> Result<(), Box<dyn Error>> {
126        let mut formdata = HashMap::new();
127        formdata.insert("acct", username);
128        formdata.insert("pw", password);
129        let goto = "newest".to_string();
130        formdata.insert("goto", &goto);
131
132        let mut headers = HeaderMap::new();
133        headers.insert("User-Agent", "hacker-news client/0.0.1".parse().unwrap());
134
135        // Login request requires no redirect on response, therefore we build a 
136        // new one rather than referencing self.http_client.
137        // TODO: Is there a better way to accomodate this?
138        let client = ClientBuilder::new()
139            .redirect(Policy::none())
140            .build()?;
141
142        // Send login request
143        let req = client.post(URL_LOGIN)
144            .headers(headers)
145            .form(&formdata);
146        log::debug!("login request = {:?}", req);
147        let resp = req.send()?;
148        if resp.status().as_u16() != 302 {
149            log::error!("login response = {:?}", resp);
150            return Err(Box::new(HnError::AuthenticationError));
151        }
152        log::debug!("login response = {:?}", resp);
153
154        // Store user session cookie
155        let cookies: Vec<Cookie> = resp.cookies().collect();
156        let cookie = cookies.get(0)
157            // .ok_or("Unable to retrieve user cookie")?;
158            .ok_or_else(|| {
159                log::error!("Unable to parse user cookie from succesful login response, \
160                    response = {:?}, cookies = {:?}", resp, cookies);
161                HnError::HtmlParsingError
162            })?;
163        let cookie = Some((cookie.name().to_string(), cookie.value().to_string()));
164
165        // Store on client instance field
166        *self.cookie.borrow_mut() = cookie;
167        println!("cookie = {:?}", self.cookie);
168
169        Ok(())
170    }
171    
172    pub fn item(&self, id: Id) -> Result<Listing, Box<dyn Error>> {
173        let url = format!("https://news.ycombinator.com/item?id={}", id);
174        let req = self.http_client.get(&url);
175        log::debug!("Send GET request to {:?}", url);
176        let resp = req.send()?;
177        let status = resp.status().as_u16();
178        if status != 200 {
179            let err = HttpError {
180                url: resp.url().to_string(),
181                code: status,
182            };
183            log::error!("Received non-200 response: {:?}", err);
184            return Err(Box::new(HnError::HttpError(err)));
185        }
186        log::debug!("Received 200 response from {:?}", url);
187
188        let text = resp.text()?;
189        let html = Html::parse_document(&text);
190
191        // Note: There is an assumption here that given an item ID, we should
192        // only extract one listing from a page. Therefore, we can simply pop once
193        // from the Vec obtained by extract listings.
194
195        let item = ListingsParser::parse(&html)?
196            .pop()
197            .ok_or(format!("Did not find item {}", id))?;
198
199        Ok(item)
200    }
201
202    pub fn thread(&self, id: Id) -> Result<Thread, Box<dyn Error>> {
203        log::debug!("HTML client attempting comments for id = {:?}", id);
204        let url = format!("https://news.ycombinator.com/item?id={}", id);
205        let req = self.http_client.get(&url);
206        let resp = req.send()?;
207        let text = resp.text()?;
208        let html = Html::parse_document(&text);
209        let comments = CommentsParser::parse(&html)?;
210        let comments = create_comment_tree(comments);
211        let listings = ListingsParser::parse(&html)?;
212        if listings.len() > 1 {
213            log::warn!("Parsed multiple listings for a thread, where only 1 is expected");
214        }
215        let listing = listings.into_iter()
216            .next()
217            .ok_or_else(|| {
218                log::error!("Succesfully parsed HTML, but found no listings");
219                HnError::HtmlParsingError
220            })?;
221        let thread = Thread { listing, comments };
222        
223        Ok(thread)
224    }
225
226    pub fn news(&self) -> Result<Vec<Listing>, Box<dyn Error>> {
227        self.listings("https://news.ycombinator.com/news")
228    }
229
230    pub fn past(&self, date: Date) -> Result<Vec<Listing>, Box<dyn Error>> {
231        let url = format!("https://news.ycombinator.com/front?day={}-{}-{}",
232            date.0, date.1, date.2);
233
234        self.listings(&url)
235    }
236
237    /// Retrieve a page of HackerNews Listings, such as that delivered from:
238    /// * `https://news.ycombinator.com/`
239    /// * `https://news.ycombinator.com/newest`
240    /// * `https://news.ycombinator.com/front`
241    /// * `https://news.ycombinator.com/newcomments`
242    /// * `https://news.ycombinator.com/ask`
243    /// * `https://news.ycombinator.com/show`
244    /// * `https://news.ycombinator.com/jobs`
245    pub fn listings(&self, url: &str) -> Result<Vec<Listing>, Box<dyn Error>> {
246        let req = self.http_client.get(url);
247        let resp = req.send()?;
248        let text = resp.text()?;
249        let html = Html::parse_document(&text);
250        let listings = ListingsParser::parse(&html)?;
251
252        Ok(listings)
253    }
254}
255
256
257#[cfg(test)]
258mod tests {
259
260    use super::*;
261
262    use crate::util::setup;
263
264    #[test]
265    fn test_news() -> Result<(), Box<dyn Error>> {
266        setup();
267        let client = Client::new();
268        let listings = client.news()?;
269        log::info!("Successfully called Client::news()");
270        log::trace!("Listings output from Client::news() = {:?}", listings);
271
272        Ok(())
273    }
274
275    #[test]
276    fn test_item() -> Result<(), Box<dyn Error>> {
277        setup();
278        let client = Client::new();
279        let item = client.item(25925926)?;
280        log::debug!("test_item item = {:#?}", item);
281
282        Ok(())
283    }
284
285    #[test]
286    fn test_comments() -> Result<(), Box<dyn Error>> {
287        setup();
288        let client = Client::new();
289        let comments = client.thread(100)?;
290        log::debug!("comments = {:?}", comments);
291
292        Ok(())
293    }
294
295    #[test]
296    fn test_login() -> Result<(), Box<dyn Error>> {
297        setup();
298        let user: String = match std::env::var("HN_USER") {
299            Ok(user) => user,
300            Err(_) => {
301                log::warn!("login test unable to retrieve Hacker News username from \
302                environment variable $HN_USER. Omitting test.");
303                return Ok(());
304            }
305        };
306
307        let pwd: String = match std::env::var("HN_PASS") {
308            Ok(pwd) => pwd,
309            Err(_) => {
310                log::warn!("login test unable to retrieve Hacker News password from \
311                environment variable $HN_PASS. Omitting test.");
312                return Ok(());
313            }
314        };
315        
316        let client = Client::new();
317        client.login(&user, &pwd)?;
318
319        Ok(())
320    }
321
322}