use dom_query::{Document, Matcher, Node, Selection};
use tendril::StrTendril;
use crate::errors::ParseError;
use crate::sanitization::SanitizeOption;
use super::config::{CastType, Config};
use super::pipeline::Pipeline;
use super::value::{InnerMap, Value};
/// Key under which `Finder::parse_children_to_slice_maps` stores an item's
/// position when `enumerate` is enabled.
const INDEX_FIELD: &str = "index";
/// `extract` value that makes `Finder::extract_data` return `node.text()`.
const EXTRACT_TEXT: &str = "text";
/// `extract` value that makes `Finder::extract_data` return
/// `node.immediate_text()` (handled identically to `EXTRACT_IMMEDIATE_TEXT`).
const EXTRACT_INNER_TEXT: &str = "inner_text";
/// `extract` value that makes `Finder::extract_data` return
/// `node.immediate_text()`.
const EXTRACT_IMMEDIATE_TEXT: &str = "immediate_text";
/// `extract` value that routes the node through `sanitize_policy.clean_html`.
const EXTRACT_HTML: &str = "html";
/// `extract` value that routes the node through
/// `sanitize_policy.clean_inner_html`.
const EXTRACT_INNER_HTML: &str = "inner_html";
/// A compiled extraction rule built from a `Config`.
///
/// A `Finder` selects nodes from a `dom_query` selection, extracts a string
/// (or a nested structure produced by its `children`) from them and returns
/// the result as a `Value`.
#[derive(Debug)]
pub struct Finder {
/// Key under which this finder's result is stored in the enclosing map.
name: Box<str>,
/// What to extract from a node: one of the `EXTRACT_*` constants, otherwise
/// it is treated as an attribute name.
extract: Box<str>,
/// Target type passed to `cast_value` for extracted strings.
cast: CastType,
/// When non-empty (and `many` is set without children), the extracted
/// strings are joined with this separator into a single string value.
join_sep: Box<str>,
/// Select every match instead of a single one; also switches the result
/// shape to a collection.
many: bool,
/// Add an `index` entry to every map produced for a `many` finder with children.
enumerate: bool,
/// Reuse the selection handed in by the caller instead of running `matcher`.
inherit: bool,
/// After matching, work on the parents of the matched nodes.
parent: bool,
/// Stop evaluating children after the first one that yields a non-empty value.
first_occurrence: bool,
/// Remove the selected nodes from the document once the value has been parsed.
remove_selection: bool,
/// When this finder yields an object, merge its entries into the enclosing
/// map instead of nesting them under `name`.
flatten: bool,
/// Nested finders evaluated against this finder's selection.
children: Vec<Finder>,
/// Compiled selector built from `base_path`; `None` is only accepted for a
/// non-root finder with `inherit` set.
matcher: Option<Matcher>,
/// Sanitization applied when extracting `html` / `inner_html`.
sanitize_policy: SanitizeOption,
/// Optional post-processing applied to every extracted string.
pipeline: Option<Pipeline>,
}
impl Finder {
pub fn new(config: &Config) -> Result<Finder, ParseError> {
Finder::from_config(config, true)
}
fn from_config(config: &Config, is_root: bool) -> Result<Finder, ParseError> {
config.validate()?;
let base_path = config.base_path.as_str();
let matcher = if !base_path.is_empty() {
Matcher::new(base_path).ok()
} else {
None
};
if matcher.is_none() && (is_root || !config.inherit) {
return Err(ParseError::RequireMatcher);
}
let pipeline = if !config.pipeline.is_empty() {
Some(Pipeline::new(&config.pipeline)?)
} else {
None
};
let mut p = Finder {
name: config.name.clone().into(),
extract: config.extract.clone().into(),
cast: config.cast,
join_sep: config.join_sep.clone().into(),
many: config.many,
enumerate: config.enumerate,
inherit: config.inherit,
parent: config.parent,
first_occurrence: config.first_occurrence,
remove_selection: config.remove_selection,
flatten: config.flatten,
children: Vec::new(),
matcher,
sanitize_policy: config.sanitize_policy,
pipeline,
};
for inline_config in config.children.iter() {
p.children.push(Finder::from_config(inline_config, false)?);
}
Ok(p)
}
fn get_matcher(&self) -> &Matcher {
match self.matcher {
Some(ref m) => m,
None => {
panic!("no matcher")
}
}
}
pub fn parse(&self, html: &str) -> Value {
let doc = Document::from(html);
self.parse_document(&doc)
}
pub fn parse_document(&self, doc: &Document) -> Value {
let sel = Selection::from(doc.root());
let val = self.parse_value(&sel);
let mut m: InnerMap = InnerMap::default();
m.insert(self.name.to_string(), val);
Value::Object(m)
}
pub fn parse_value(&self, root: &Selection) -> Value {
let sel: Selection = if self.inherit {
root.clone()
} else if self.parent {
root.select_matcher(self.get_matcher()).parent()
} else if self.many {
root.select_matcher(self.get_matcher())
} else {
root.select_single_matcher(self.get_matcher())
};
if !sel.exists() {
return Value::Null;
}
let has_children = !self.children.is_empty();
let v = match (has_children, self.many) {
(true, true) => self.parse_children_to_slice_maps(&sel),
(true, false) => self.parse_children_to_map(&sel),
(false, true) => {
let tmp_res: Vec<String> = sel
.nodes()
.iter()
.filter_map(|item| self.handle_data(item))
.collect();
if !self.join_sep.is_empty() {
Value::from(tmp_res.join(&self.join_sep))
} else {
Value::from_iter(tmp_res.into_iter().map(|it| cast_value(it, self.cast)))
}
}
_ => {
if let Some(tmp_val) = sel.nodes().first().and_then(|n| self.handle_data(n)) {
cast_value(tmp_val, self.cast)
} else {
Value::Null
}
}
};
if self.remove_selection {
sel.remove();
}
v
}
fn handle_data(&self, node: &Node) -> Option<String> {
self.extract_data(node).map(|extracted| {
let extracted = extracted.to_string();
if let Some(ref pipeline) = self.pipeline {
pipeline.handle(extracted)
} else {
extracted
}
})
}
fn children_to_map(&self, element: &Selection) -> InnerMap {
let mut m = InnerMap::default();
for inline in self.children.iter() {
let v = inline.parse_value(element);
if v.is_empty() {
continue;
}
if inline.flatten {
if let Value::Object(obj) = v {
m.extend(obj);
} else {
m.insert(inline.name.to_string(), v);
}
} else {
m.insert(inline.name.to_string(), v);
}
if self.first_occurrence {
break;
}
}
m
}
fn parse_children_to_map(&self, element: &Selection) -> Value {
Value::Object(self.children_to_map(element))
}
fn parse_children_to_slice_maps(&self, selection: &Selection) -> Value {
let mut values: Vec<InnerMap> = Vec::new();
for item in selection.iter() {
let m: InnerMap = self.children_to_map(&item);
if !m.is_empty() {
values.push(m);
}
}
if self.enumerate {
for (i, item) in values.iter_mut().enumerate() {
item.insert(INDEX_FIELD.to_string(), Value::Int(i as i64));
}
}
Value::from_iter(values.into_iter().map(Value::Object))
}
#[inline(always)]
fn extract_data(&self, node: &Node) -> Option<StrTendril> {
let extract_type = self.extract.as_ref();
match self.extract.as_ref() {
EXTRACT_TEXT => Some(node.text()),
EXTRACT_INNER_TEXT | EXTRACT_IMMEDIATE_TEXT => Some(node.immediate_text()),
EXTRACT_HTML => self.sanitize_policy.clean_html(node),
EXTRACT_INNER_HTML => self.sanitize_policy.clean_inner_html(node),
_ => node.attr(extract_type),
}
}
}
/// Converts an extracted string into a `Value` according to `cast`.
///
/// * `CastType::Bool` — `true` when the string is non-empty.
/// * `CastType::Int` — the string parsed as `i64`, or `0` when parsing fails.
/// * `CastType::Float` — the string parsed as `f64`, or `0.0` when parsing fails.
/// * any other cast type — the string itself.
fn cast_value(s: String, cast: CastType) -> Value {
    match cast {
        CastType::Float => {
            let number: f64 = s.parse().unwrap_or_default();
            Value::from(number)
        }
        CastType::Int => {
            let number: i64 = s.parse().unwrap_or_default();
            Value::from(number)
        }
        CastType::Bool => {
            let non_empty = !s.is_empty();
            Value::from(non_empty)
        }
        _ => Value::from(s),
    }
}
impl TryFrom<Config> for Finder {
type Error = ParseError;
fn try_from(config: Config) -> Result<Self, Self::Error> {
Finder::new(&config)
}
}
// Construction tests: each case loads a YAML config and checks whether a
// `Finder` can be built from it.
#[cfg(test)]
mod tests {
use super::*;
// A valid nested configuration (inheriting children, one with a regex
// pipeline) converts into a `Finder` via `TryFrom<Config>`.
#[test]
fn create_finder_success() {
let cfg_yml: &str = r"
name: root
base_path: html
children:
- name: links
base_path: a[href]
many: true
children:
- name: link
inherit: true
extract: href
- name: title
inherit: true
extract: text
- name: domain
inherit: true
extract: href
pipeline: [[regex, 'https?://([a-zA-Z0-9.-]+)/']]
";
let finder: Result<Finder, _> = Config::from_yaml(cfg_yml).unwrap().try_into();
assert!(finder.is_ok());
}
// A root finder cannot rely on `inherit`: without a `base_path` there is no
// matcher, so construction must fail.
#[test]
fn create_finder_inherit_root_fail() {
let cfg_yml: &str = r"
name: root
inherit: true
extract: text
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
let finder = Finder::new(&cfg);
assert!(finder.is_err());
}
// A `regex` pipeline step declared without its argument must be rejected
// when the finder is built.
#[test]
fn finder_pipeline_missing_arguments() {
let cfg_yml: &str = r"
name: root
base_path: html
children:
- name: links
base_path: a[href]
many: true
children:
- name: domain
inherit: true
extract: href
pipeline: [[regex]]
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
let finder = Finder::new(&cfg);
assert!(finder.is_err());
}
// A pipeline step naming an unknown processor must be rejected when the
// finder is built.
#[test]
fn finder_pipeline_non_existing_proc() {
let cfg_yml: &str = r"
name: root
base_path: html
children:
- name: all_links
base_path: a[href]
many: true
pipeline: [[non_existing_proc]]
";
let cfg = Config::from_yaml(cfg_yml).unwrap();
let finder = Finder::new(&cfg);
assert!(finder.is_err());
}
}