hacker_news/parser/
mod.rs

1use std::error::Error;
2use std::ops::Deref;
3use lazy_static::lazy_static;
4use regex::Regex;
5use scraper;
6use scraper::Html;
7use scraper::Selector;
8use scraper::ElementRef;
9use crate::error::HnError;
10
11pub mod comments;
12pub mod listings;
13
14// Re-exports parser namespaces for convenient library ergonomics
15pub use crate::parser::comments::CommentsParser;
16pub use crate::parser::listings::ListingsParser;
17
18pub trait HtmlParse {
19    type Item;
20
21    fn parse(html: &Html) -> Result<Self::Item, Box<dyn Error>>;
22}
23
24// Query for an ancestor node at a given height
25fn ancestor<'a>(node: &'a ElementRef, height: u32) -> Option<ElementRef<'a>> {
26
27    // Note: Declaring `parent` outside the loop resolves an error
28    // regarding dropping to early; however, the compiler reports this as
29    // an unused assignment
30
31    let mut curr_node = Deref::deref(node);
32    #[allow(unused_assignments)]
33    let mut parent = curr_node.parent();
34    let mut i = 0;
35
36    while i < height {
37        parent = curr_node.parent();
38        curr_node = match parent {
39            Some(ref node_ref) => node_ref,
40            None => { return None; },
41        };
42        i += 1;
43    }
44
45    ElementRef::wrap(*curr_node)
46}
47
48// Search for any additional nodes of text, and append to buffer 
49fn append_more_text_nodes(node: &ElementRef, qs: &Selector, text: &mut String, ) {
50    for child in node.select(qs) {
51        match child.text().next() {
52            None => {
53                // This branch handles a <p> node with no inner text. With no inner
54                // text, there is nothing to append, and we simply continue
55                continue;
56            }
57            Some(more_text) => {
58                // We add a newline since we're concatenating <p> node text together
59                text.push('\n');
60                text.push_str(more_text);
61            },
62        }
63    }
64}
65
66lazy_static! {
67    static ref FNID_REGEX: Regex =  Regex::new(r#"<input.*value="(.+?)".*>"#).unwrap();
68}
69
70pub fn extract_fnid(el: &ElementRef) -> Result<String, Box<dyn Error>> {
71    let text = el.html();
72    let captures = match FNID_REGEX.captures(&text) {
73        Some(captures) => captures,
74        None => {
75            return Err(Box::new(HnError::HtmlParsingError));
76        }
77    };
78    let fnid = match captures.get(1) {
79        Some(fnid) => {
80            fnid.as_str().to_string()
81        },
82        None => {
83            return Err(Box::new(HnError::HtmlParsingError));
84        }
85    };
86
87    Ok(fnid)
88}
89