hacker_news/parser/
comments.rs

1use std::error::Error;
2use std::collections::VecDeque;
3use log;
4use scraper::Html;
5use scraper::ElementRef;
6use scraper::Selector;
7use lazy_static::lazy_static;
8use crate::model::Comment;
9use crate::error::HnError;
10use crate::model::Id;
11use crate::parser;
12use crate::parser::HtmlParse;
13
14const COMMENT_INDENT_INCR: u32 = 40;
15
16lazy_static! {
17    // Applied to root of HTML document
18    static ref QS_COMMENT_TABLE: Selector = Selector::parse("table.comment-tree").unwrap();
19    
20    // Applied to comment tree root (i.e. node `table.comment-tree`)
21    static ref QS_COMMENT: Selector = Selector::parse("tr.athing.comtr").unwrap();
22    
23    // Applied to comment node (i.e. node `tr.athing.comtr`)
24    static ref QS_COMMENT_DEAD: Selector = Selector::parse("div.comment").unwrap();
25    static ref QS_COMMENT_TEXT: Selector = Selector::parse("span.commtext").unwrap();
26    static ref QS_COMMENT_MORE_TEXT: Selector = Selector::parse("span.commtext p").unwrap();
27    static ref QS_COMMENT_USER: Selector = Selector::parse("a.hnuser").unwrap();
28    static ref QS_COMMENT_INDENT: Selector = Selector::parse("td.ind img").unwrap();
29}
30
31
32pub struct CommentsParser;
33
34impl HtmlParse for CommentsParser {
35    type Item = Vec<Comment>;
36
37    fn parse(html: &Html) -> Result<Self::Item, Box<dyn Error>> {
38        let mut comments = Vec::new();
39
40        let root = match Self::query_comment_root(html)? {
41            Some(root) => root,
42            // TODO: Is it possible there are other erroneous reasones the 'None' branch could get
43            // hit? It could be misleading if you get an empty Vec of comments if the HTML page
44            // itself was bad.
45            None => {
46                // If querying comment root gets no results, then this Id has no comments
47                return Ok(comments);
48            }
49        };
50
51        for node in root.select(&QS_COMMENT) {
52            let id = Self::parse_id(&node)?;
53            log::debug!("Parsing comment id={:?}", id);
54            let dead = Self::parse_dead_flag(&node, id)?;
55            log::debug!("Comment id={:?} is dead={:?}", id, dead);
56            let text = match dead {
57                true => None,
58                false => Self::parse_text(&node, id)?,
59            };
60            let user = Self::parse_user(&node, id)?;
61            let indent = Self::parse_indent(&node, id)?;
62            let children = Vec::new();
63            comments.push(Comment {
64                user,
65                id,
66                text,
67                indent,
68                dead,
69                children 
70            });
71        }
72
73        Ok(comments)
74    }
75}
76
77impl CommentsParser {
78
79    fn query_comment_root(html: &Html) -> Result<Option<ElementRef>, Box<dyn Error>> {
80        // Note: This uses the first comment table found. There shouldn't ever
81        // be more than one comment table; however, as is there is not an explicit check
82        let root = html.select(&QS_COMMENT_TABLE)
83            .next();
84
85        Ok(root)
86    }
87
88    fn parse_id(node: &ElementRef) -> Result<Id, Box<dyn Error>> {
89        let id = node.value()
90            .id()
91            .ok_or_else(|| {
92                log::error!("Failed to find id for comment; html = '{:?}'", node.html());
93                HnError::HtmlParsingError
94            })?
95            .parse::<Id>()?;
96
97        Ok(id)
98    }
99
100    fn parse_dead_flag(node: &ElementRef, id: Id) -> Result<bool, Box<dyn Error>> {
101        let comment_div = node.select(&QS_COMMENT_DEAD)
102        .next()
103        .ok_or_else(|| {
104            log::error!("Failed to find node 'div.comment' for id = {:?}", id);
105            HnError::HtmlParsingError
106        })?;
107
108        match comment_div.text().next() {
109            None => Ok(false),
110            Some(text) => if text.contains("[flagged]") {
111                Ok(true)
112            } else {
113                Ok(false)
114            },
115        }
116    }
117
118    fn parse_text(node: &ElementRef, id: Id) -> Result<Option<String>, Box<dyn Error>> {
119
120        // Select inner text from root of comment text node
121        // let text_node = node.select(&QS_COMMENT_TEXT)
122        //     .next()
123        //     .ok_or_else(|| {
124        //         log::error!("Failed to find comment text for id = {}", id);
125        //         HnError::HtmlParsingError
126        //     })?;
127
128        let text_node = match node.select(&QS_COMMENT_TEXT).next() {
129            Some(text_node) => text_node,
130            None => {
131                log::warn!("Did not find comment text node for id = {}", id);
132                return Ok(None);
133            }
134        };
135
136        // let mut text = text_node.text()
137        //     .next()
138        //     .ok_or_else(|| {
139        //         log::error!("Failed to extract inner text for comment id = {}", id);
140        //         let msg = format!("Failed to extract inner text for comment id = {}", id);
141        //         msg.as_str().to_owned()
142        //     })?
143        //     .to_string();
144
145        let mut text = match text_node.text().next() {
146            Some(text) => text.to_string(),
147            None => {
148                log::warn!("Failed to extract inner text for comment id = {}", id);
149                return Ok(None);
150            }
151        };
152        parser::append_more_text_nodes(node, &QS_COMMENT_MORE_TEXT, &mut text);
153
154        Ok(Some(text))
155    }
156
157    fn parse_user(node: &ElementRef, id: Id) -> Result<String, Box<dyn Error>> {
158        let user = node.select(&QS_COMMENT_USER)
159            .next()
160            .ok_or_else(|| {
161                log::error!("Failed to find the user node for comment id={}", id);
162                HnError::HtmlParsingError
163            })?
164            .text()
165            .next()
166            .ok_or_else(|| {
167                log::error!("Failed to extract user text for comment id = {}", id);
168                HnError::HtmlParsingError
169            })?
170            .to_string();
171
172        Ok(user)
173    }
174
175    fn parse_indent(node: &ElementRef, id: Id) -> Result<u32, Box<dyn Error>> {
176        let indent = node.select(&QS_COMMENT_INDENT)
177            .next()
178            .ok_or_else(|| {
179                log::error!("Failed to find indent node under comment id = {}", id);
180                HnError::HtmlParsingError
181            })?
182            .value()
183            .attr("width")
184            .ok_or_else(|| {
185                log::error!("Failed to extract indent width attribute from comment id = {}", id);
186                HnError::HtmlParsingError
187            })?
188            .parse::<u32>()?;
189
190        Ok(indent)
191    }
192}
193
194pub fn create_comment_tree(comments: Vec<Comment>) -> Vec<Comment> {
195
196    #[allow(clippy::comparison_chain)]
197    fn _create_comment_tree(q: &mut VecDeque<Comment>, parent: &mut Comment) {
198        let mut last: Option<&mut Comment> = None;
199        while let Some(c) = q.front() {
200            if c.indent == parent.indent + COMMENT_INDENT_INCR {
201                let c = q.pop_front().unwrap();
202                parent.children.push(c);
203                last = Some(parent.children.last_mut().unwrap());
204            }
205            else if c.indent > parent.indent + COMMENT_INDENT_INCR {
206                let next_parent = last.take()
207                    .expect("Jumped a nesting level in comment node hierarchy");
208                _create_comment_tree(q, next_parent);
209            }
210            else {
211                return;
212            }
213        }
214    }
215
216    let mut q = VecDeque::from(comments);
217    let mut forest = Vec::new();
218
219    while let Some(root) = q.pop_front() {
220        forest.push(root);
221        let ptr = forest.last_mut().unwrap();
222        _create_comment_tree(&mut q, ptr);
223    }
224
225    forest
226}
227