use std::error::Error;
use std::collections::VecDeque;
use log;
use scraper::Html;
use scraper::ElementRef;
use scraper::Selector;
use lazy_static::lazy_static;
use crate::model::Comment;
use crate::error::HnError;
use crate::model::Id;
use crate::parser;
use crate::parser::HtmlParse;
/// Amount by which a comment's `indent` grows for each additional level of
/// nesting: `create_comment_tree` treats a comment whose indent equals its
/// parent's indent plus this value as a direct child of that parent.
const COMMENT_INDENT_INCR: u32 = 40;
// CSS selectors used when parsing comments. `lazy_static!` builds each one on
// first access; the `unwrap()` calls only fail if a selector literal is invalid.
lazy_static! {
// The table under which the parser looks for comment rows.
static ref QS_COMMENT_TABLE: Selector = Selector::parse("table.comment-tree").unwrap();
// One row per comment; the row's element id is parsed as the comment id.
static ref QS_COMMENT: Selector = Selector::parse("tr.athing.comtr").unwrap();
// The `div.comment` node whose first text node is checked for the "[flagged]" marker.
static ref QS_COMMENT_DEAD: Selector = Selector::parse("div.comment").unwrap();
// Span whose first text node supplies the start of the comment text.
static ref QS_COMMENT_TEXT: Selector = Selector::parse("span.commtext").unwrap();
// Paragraphs inside the text span; handed to `parser::append_more_text_nodes`.
static ref QS_COMMENT_MORE_TEXT: Selector = Selector::parse("span.commtext p").unwrap();
// Link whose first text node is taken as the author's user name.
static ref QS_COMMENT_USER: Selector = Selector::parse("a.hnuser").unwrap();
// Image in the indent cell; its `width` attribute is parsed as the comment's indent.
static ref QS_COMMENT_INDENT: Selector = Selector::parse("td.ind img").unwrap();
}
/// Stateless parser that extracts a flat list of [`Comment`]s from the comment
/// table of a parsed HTML document.
pub struct CommentsParser;
impl HtmlParse for CommentsParser {
type Item = Vec<Comment>;
fn parse(html: &Html) -> Result<Self::Item, Box<dyn Error>> {
let mut comments = Vec::new();
let root = match Self::query_comment_root(html)? {
Some(root) => root,
None => {
return Ok(comments);
}
};
for node in root.select(&QS_COMMENT) {
let id = Self::parse_id(&node)?;
log::debug!("Parsing comment id={:?}", id);
let dead = Self::parse_dead_flag(&node, id)?;
log::debug!("Comment id={:?} is dead={:?}", id, dead);
let text = match dead {
true => None,
false => Self::parse_text(&node, id)?,
};
let user = Self::parse_user(&node, id)?;
let indent = Self::parse_indent(&node, id)?;
let children = Vec::new();
comments.push(Comment {
user,
id,
text,
indent,
dead,
children
});
}
Ok(comments)
}
}
impl CommentsParser {
    /// Returns the first element matching `table.comment-tree`, or `None` when
    /// the document has no such table.
    fn query_comment_root(html: &Html) -> Result<Option<ElementRef>, Box<dyn Error>> {
        Ok(html.select(&QS_COMMENT_TABLE).next())
    }

    /// Parses the comment id from the `id` attribute of the comment row.
    ///
    /// Fails with `HnError::HtmlParsingError` when the row has no id, or with
    /// the parse error when the id text cannot be converted to `Id`.
    fn parse_id(node: &ElementRef) -> Result<Id, Box<dyn Error>> {
        let raw_id = match node.value().id() {
            Some(raw_id) => raw_id,
            None => {
                log::error!("Failed to find id for comment; html = '{:?}'", node.html());
                return Err(HnError::HtmlParsingError.into());
            }
        };

        Ok(raw_id.parse::<Id>()?)
    }

    /// Reports whether the comment is marked as flagged.
    ///
    /// Only the first text node under `div.comment` is inspected; the result
    /// is `true` when it contains `[flagged]`. Fails with
    /// `HnError::HtmlParsingError` when the row has no `div.comment` node.
    fn parse_dead_flag(node: &ElementRef, id: Id) -> Result<bool, Box<dyn Error>> {
        let comment_div = match node.select(&QS_COMMENT_DEAD).next() {
            Some(div) => div,
            None => {
                log::error!("Failed to find node 'div.comment' for id = {:?}", id);
                return Err(HnError::HtmlParsingError.into());
            }
        };

        let flagged = comment_div
            .text()
            .next()
            .map_or(false, |first| first.contains("[flagged]"));
        Ok(flagged)
    }

    /// Extracts the comment text, or `None` (with a warning logged) when the
    /// text span or its first text node is missing.
    ///
    /// The first text node of `span.commtext` seeds the result;
    /// `parser::append_more_text_nodes` is then called with the paragraph
    /// selector to extend it.
    fn parse_text(node: &ElementRef, id: Id) -> Result<Option<String>, Box<dyn Error>> {
        let first_fragment = match node.select(&QS_COMMENT_TEXT).next() {
            Some(span) => span.text().next().map(|fragment| fragment.to_string()),
            None => {
                log::warn!("Did not find comment text node for id = {}", id);
                return Ok(None);
            }
        };

        if let Some(mut text) = first_fragment {
            parser::append_more_text_nodes(node, &QS_COMMENT_MORE_TEXT, &mut text);
            Ok(Some(text))
        } else {
            log::warn!("Failed to extract inner text for comment id = {}", id);
            Ok(None)
        }
    }

    /// Extracts the author's user name from the first text node of the
    /// `a.hnuser` link.
    ///
    /// Fails with `HnError::HtmlParsingError` when the link or its text is
    /// missing.
    fn parse_user(node: &ElementRef, id: Id) -> Result<String, Box<dyn Error>> {
        let user_link = match node.select(&QS_COMMENT_USER).next() {
            Some(link) => link,
            None => {
                log::error!("Failed to find the user node for comment id={}", id);
                return Err(HnError::HtmlParsingError.into());
            }
        };

        match user_link.text().next() {
            Some(name) => Ok(name.to_string()),
            None => {
                log::error!("Failed to extract user text for comment id = {}", id);
                Err(HnError::HtmlParsingError.into())
            }
        }
    }

    /// Parses the comment's indent from the `width` attribute of the
    /// `td.ind img` element.
    ///
    /// Fails with `HnError::HtmlParsingError` when the image or the attribute
    /// is missing, or with the parse error when the width is not a valid `u32`.
    fn parse_indent(node: &ElementRef, id: Id) -> Result<u32, Box<dyn Error>> {
        let spacer = match node.select(&QS_COMMENT_INDENT).next() {
            Some(spacer) => spacer,
            None => {
                log::error!("Failed to find indent node under comment id = {}", id);
                return Err(HnError::HtmlParsingError.into());
            }
        };

        let width = match spacer.value().attr("width") {
            Some(width) => width,
            None => {
                log::error!("Failed to extract indent width attribute from comment id = {}", id);
                return Err(HnError::HtmlParsingError.into());
            }
        };

        let indent = width.parse::<u32>()?;
        Ok(indent)
    }
}
/// Rebuilds the reply hierarchy from a flat, document-ordered list of comments.
///
/// Nesting is inferred purely from each comment's `indent`. The first pending
/// comment always starts a new tree. While a comment is being filled, the next
/// pending comment becomes its child when that comment's indent is at least
/// [`COMMENT_INDENT_INCR`] greater; otherwise the current comment is complete
/// and the pending one is tested against the enclosing comment in turn. A
/// comment that fits under no enclosing comment starts a new root.
///
/// Malformed input that skips a nesting level (or uses an indent that is not a
/// multiple of the increment) is tolerated: such a comment is attached to the
/// nearest enclosing comment that can hold it rather than triggering a panic.
///
/// Children are appended to whatever each comment's `children` already holds,
/// and sibling order follows input order.
///
/// Returns the root comments, each owning its nested replies.
pub fn create_comment_tree(comments: Vec<Comment>) -> Vec<Comment> {
    /// Moves every comment at the front of `pending` that is nested below
    /// `parent` into `parent.children`, filling in each child's own replies
    /// before the child is attached.
    fn adopt_descendants(pending: &mut VecDeque<Comment>, parent: &mut Comment) {
        // `indent` originates from scraped markup, so guard the addition
        // against overflow instead of trusting the value to be small.
        let child_indent = parent.indent.saturating_add(COMMENT_INDENT_INCR);

        while pending
            .front()
            .map_or(false, |next| next.indent >= child_indent)
        {
            if let Some(mut child) = pending.pop_front() {
                // Recursing first consumes the child's whole subtree, so the
                // next front element is a sibling of `child` or shallower.
                adopt_descendants(pending, &mut child);
                parent.children.push(child);
            }
        }
    }

    let mut pending = VecDeque::from(comments);
    let mut forest = Vec::new();

    while let Some(mut root) = pending.pop_front() {
        adopt_descendants(&mut pending, &mut root);
        forest.push(root);
    }

    forest
}