hacker_news/parser/
comments.rs1use std::error::Error;
2use std::collections::VecDeque;
3use log;
4use scraper::Html;
5use scraper::ElementRef;
6use scraper::Selector;
7use lazy_static::lazy_static;
8use crate::model::Comment;
9use crate::error::HnError;
10use crate::model::Id;
11use crate::parser;
12use crate::parser::HtmlParse;
13
14const COMMENT_INDENT_INCR: u32 = 40;
15
16lazy_static! {
17 static ref QS_COMMENT_TABLE: Selector = Selector::parse("table.comment-tree").unwrap();
19
20 static ref QS_COMMENT: Selector = Selector::parse("tr.athing.comtr").unwrap();
22
23 static ref QS_COMMENT_DEAD: Selector = Selector::parse("div.comment").unwrap();
25 static ref QS_COMMENT_TEXT: Selector = Selector::parse("span.commtext").unwrap();
26 static ref QS_COMMENT_MORE_TEXT: Selector = Selector::parse("span.commtext p").unwrap();
27 static ref QS_COMMENT_USER: Selector = Selector::parse("a.hnuser").unwrap();
28 static ref QS_COMMENT_INDENT: Selector = Selector::parse("td.ind img").unwrap();
29}
30
31
32pub struct CommentsParser;
33
34impl HtmlParse for CommentsParser {
35 type Item = Vec<Comment>;
36
37 fn parse(html: &Html) -> Result<Self::Item, Box<dyn Error>> {
38 let mut comments = Vec::new();
39
40 let root = match Self::query_comment_root(html)? {
41 Some(root) => root,
42 None => {
46 return Ok(comments);
48 }
49 };
50
51 for node in root.select(&QS_COMMENT) {
52 let id = Self::parse_id(&node)?;
53 log::debug!("Parsing comment id={:?}", id);
54 let dead = Self::parse_dead_flag(&node, id)?;
55 log::debug!("Comment id={:?} is dead={:?}", id, dead);
56 let text = match dead {
57 true => None,
58 false => Self::parse_text(&node, id)?,
59 };
60 let user = Self::parse_user(&node, id)?;
61 let indent = Self::parse_indent(&node, id)?;
62 let children = Vec::new();
63 comments.push(Comment {
64 user,
65 id,
66 text,
67 indent,
68 dead,
69 children
70 });
71 }
72
73 Ok(comments)
74 }
75}
76
77impl CommentsParser {
78
79 fn query_comment_root(html: &Html) -> Result<Option<ElementRef>, Box<dyn Error>> {
80 let root = html.select(&QS_COMMENT_TABLE)
83 .next();
84
85 Ok(root)
86 }
87
88 fn parse_id(node: &ElementRef) -> Result<Id, Box<dyn Error>> {
89 let id = node.value()
90 .id()
91 .ok_or_else(|| {
92 log::error!("Failed to find id for comment; html = '{:?}'", node.html());
93 HnError::HtmlParsingError
94 })?
95 .parse::<Id>()?;
96
97 Ok(id)
98 }
99
100 fn parse_dead_flag(node: &ElementRef, id: Id) -> Result<bool, Box<dyn Error>> {
101 let comment_div = node.select(&QS_COMMENT_DEAD)
102 .next()
103 .ok_or_else(|| {
104 log::error!("Failed to find node 'div.comment' for id = {:?}", id);
105 HnError::HtmlParsingError
106 })?;
107
108 match comment_div.text().next() {
109 None => Ok(false),
110 Some(text) => if text.contains("[flagged]") {
111 Ok(true)
112 } else {
113 Ok(false)
114 },
115 }
116 }
117
118 fn parse_text(node: &ElementRef, id: Id) -> Result<Option<String>, Box<dyn Error>> {
119
120 let text_node = match node.select(&QS_COMMENT_TEXT).next() {
129 Some(text_node) => text_node,
130 None => {
131 log::warn!("Did not find comment text node for id = {}", id);
132 return Ok(None);
133 }
134 };
135
136 let mut text = match text_node.text().next() {
146 Some(text) => text.to_string(),
147 None => {
148 log::warn!("Failed to extract inner text for comment id = {}", id);
149 return Ok(None);
150 }
151 };
152 parser::append_more_text_nodes(node, &QS_COMMENT_MORE_TEXT, &mut text);
153
154 Ok(Some(text))
155 }
156
157 fn parse_user(node: &ElementRef, id: Id) -> Result<String, Box<dyn Error>> {
158 let user = node.select(&QS_COMMENT_USER)
159 .next()
160 .ok_or_else(|| {
161 log::error!("Failed to find the user node for comment id={}", id);
162 HnError::HtmlParsingError
163 })?
164 .text()
165 .next()
166 .ok_or_else(|| {
167 log::error!("Failed to extract user text for comment id = {}", id);
168 HnError::HtmlParsingError
169 })?
170 .to_string();
171
172 Ok(user)
173 }
174
175 fn parse_indent(node: &ElementRef, id: Id) -> Result<u32, Box<dyn Error>> {
176 let indent = node.select(&QS_COMMENT_INDENT)
177 .next()
178 .ok_or_else(|| {
179 log::error!("Failed to find indent node under comment id = {}", id);
180 HnError::HtmlParsingError
181 })?
182 .value()
183 .attr("width")
184 .ok_or_else(|| {
185 log::error!("Failed to extract indent width attribute from comment id = {}", id);
186 HnError::HtmlParsingError
187 })?
188 .parse::<u32>()?;
189
190 Ok(indent)
191 }
192}
193
194pub fn create_comment_tree(comments: Vec<Comment>) -> Vec<Comment> {
195
196 #[allow(clippy::comparison_chain)]
197 fn _create_comment_tree(q: &mut VecDeque<Comment>, parent: &mut Comment) {
198 let mut last: Option<&mut Comment> = None;
199 while let Some(c) = q.front() {
200 if c.indent == parent.indent + COMMENT_INDENT_INCR {
201 let c = q.pop_front().unwrap();
202 parent.children.push(c);
203 last = Some(parent.children.last_mut().unwrap());
204 }
205 else if c.indent > parent.indent + COMMENT_INDENT_INCR {
206 let next_parent = last.take()
207 .expect("Jumped a nesting level in comment node hierarchy");
208 _create_comment_tree(q, next_parent);
209 }
210 else {
211 return;
212 }
213 }
214 }
215
216 let mut q = VecDeque::from(comments);
217 let mut forest = Vec::new();
218
219 while let Some(root) = q.pop_front() {
220 forest.push(root);
221 let ptr = forest.last_mut().unwrap();
222 _create_comment_tree(&mut q, ptr);
223 }
224
225 forest
226}
227