use crate::parse::{element::Extraction, remove_surrounding_whitespace};
use super::*;
/// Computes the implied `name`, `photo` and `url` properties of one item.
///
/// The explicitly parsed properties are only consulted to decide whether an
/// implied value may be produced; the implied values themselves are read from
/// the item's element.
pub struct PropertiesParser {
/// Element of the item whose implied properties are being derived.
item: ElementPtr,
/// Properties already collected for the item; consulted by the `can_extract` checks.
properties: Properties,
/// Base URL that attribute values are joined against.
base_url: Url,
}
/// Shared contract of the implied-property extractors in this file.
trait ImpliedPropertyExtraction {
/// Reports whether the implied property may be derived at all, judging only
/// by the properties that were already collected for the item.
fn can_extract(properties: &Properties) -> bool;
/// Consumes the extractor and returns the implied value, or `None` when the
/// element offers nothing usable. `base_url` is used to resolve URL-like
/// attribute values.
fn extract(self, base_url: &url::Url) -> Option<PropertyValue>;
}
/// Extractor for the implied `url` property; wraps the item's element.
struct ImpliedUrlExtractor(ElementPtr);
/// Tag names whose `href` attribute can supply an implied URL.
const IMPLIED_URL_TAGS: [&str; 2] = ["a", "area"];
impl ImpliedUrlExtractor {
    /// Yields the item element itself when it carries root classes and its tag
    /// is listed in [`IMPLIED_URL_TAGS`].
    fn get_first_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        let qualifies = !item.root_classes().is_empty() && IMPLIED_URL_TAGS.contains(&item.tag());
        if qualifies {
            Some(item.clone())
        } else {
            None
        }
    }

    /// Yields the single direct child that is a link tag without root classes.
    ///
    /// Returns `None` when the item has no root classes or when the number of
    /// such children is anything other than one.
    fn get_second_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() {
            return None;
        }
        Self::single_link_child(item)
    }

    /// Yields the single qualifying link found one level deeper: the item must
    /// have exactly one child element, that child must have no root classes,
    /// and it must contain exactly one link tag without root classes.
    fn get_third_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() {
            return None;
        }
        let mut children = item.elements().into_iter();
        let wrapper = match (children.next(), children.next()) {
            (Some(only), None) => only,
            _ => return None,
        };
        if !wrapper.root_classes().is_empty() {
            return None;
        }
        Self::single_link_child(&wrapper)
    }

    /// Returns the child of `parent` that is a link tag without root classes,
    /// provided exactly one such child exists.
    fn single_link_child(parent: &Node) -> Option<Node> {
        let mut links = parent.elements().into_iter().filter(|child| {
            IMPLIED_URL_TAGS.contains(&child.tag()) && child.root_classes().is_empty()
        });
        // Pulling two items is enough to tell "exactly one" from "none/many".
        match (links.next(), links.next()) {
            (Some(only), None) => Some(only),
            _ => None,
        }
    }
}
impl ImpliedPropertyExtraction for ImpliedUrlExtractor {
    /// Resolves the `href` of the first candidate element that has one.
    ///
    /// Candidates are tried in order: the item itself, its single link child,
    /// then the single link grandchild. A value that joins against `base_url`
    /// becomes a URL value; otherwise the raw attribute text is kept as plain
    /// text. The result is dropped when `non_empty_property_value` rejects it.
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn extract(self, base_url: &url::Url) -> Option<PropertyValue> {
        let candidates = [
            self.get_first_level_element(),
            self.get_second_level_element(),
            self.get_third_level_element(),
        ];
        for candidate in candidates.into_iter().flatten() {
            if let Some(href) = candidate.attr("href") {
                let value = match base_url.join(&href) {
                    Ok(resolved) => PropertyValue::Url(UrlValue::new(resolved)),
                    Err(_) => PropertyValue::Plain(TextValue::new(href.to_string())),
                };
                // Only the first element with an `href` is ever considered.
                return Some(value).filter(non_empty_property_value);
            }
        }
        None
    }

    /// An implied URL is allowed only when there is no explicit `url` property
    /// and no collected value is a URL or an image.
    ///
    /// NOTE(review): unlike the photo extractor, nested items and fragments do
    /// not block extraction here; confirm this is intended.
    #[tracing::instrument(level = "trace", ret)]
    fn can_extract(properties: &Properties) -> bool {
        if properties.contains_key("url") {
            return false;
        }
        properties.values().flatten().all(|property_value| {
            !matches!(
                property_value,
                PropertyValue::Url(_) | PropertyValue::Image(_)
            )
        })
    }
}
/// Extractor for the implied `photo` property; wraps the item's element.
struct ImpliedPhotoExtractor(ElementPtr);
/// Tag names that can supply an implied photo.
const IMPLIED_PHOTO_TAGS: [&str; 2] = ["img", "object"];
impl ImpliedPhotoExtractor {
    /// Yields the item element itself when it carries root classes and its tag
    /// is listed in [`IMPLIED_PHOTO_TAGS`].
    fn get_first_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() || !IMPLIED_PHOTO_TAGS.contains(&item.tag()) {
            return None;
        }
        Some(item.clone())
    }

    /// Yields the only photo-tag child of the item, unless that child is itself
    /// a microformat root.
    ///
    /// Returns `None` when the item has no root classes, when the number of
    /// photo-tag children is not exactly one, or when the sole candidate has
    /// root classes of its own.
    fn get_second_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() {
            return None;
        }
        let media = item
            .elements()
            .into_iter()
            .filter(|child| IMPLIED_PHOTO_TAGS.contains(&child.tag()));
        // An element that is a root of its own (`:not[.h-*]` in the parsing
        // specification) belongs to the nested item and must not be borrowed
        // as the parent's photo.
        Self::exactly_one(media).filter(|only| only.root_classes().is_empty())
    }

    /// Yields the only qualifying photo-tag grandchild: the item must have
    /// exactly one child element, that child must not be a microformat root,
    /// and it must contain exactly one photo tag without root classes.
    fn get_third_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() {
            return None;
        }
        // The wrapper must not be a nested root either; same rule as the
        // implied URL extractor applies to its wrapper element.
        let wrapper = Self::exactly_one(item.elements().into_iter())
            .filter(|only| only.root_classes().is_empty())?;
        let media = wrapper.elements().into_iter().filter(|child| {
            IMPLIED_PHOTO_TAGS.contains(&child.tag()) && child.root_classes().is_empty()
        });
        Self::exactly_one(media)
    }

    /// Returns the sole item of `nodes`, or `None` when it yields zero or more
    /// than one node.
    fn exactly_one(mut nodes: impl Iterator<Item = Node>) -> Option<Node> {
        match (nodes.next(), nodes.next()) {
            (Some(only), None) => Some(only),
            _ => None,
        }
    }
}
impl ImpliedPropertyExtraction for ImpliedPhotoExtractor {
    /// Produces the photo value from the first candidate element that yields one.
    ///
    /// For each candidate (item, single child, single grandchild, in that
    /// order) the result of `extract_img_element` is preferred; failing that,
    /// a non-empty `data` attribute on an `object` element is joined against
    /// `base_url`, falling back to plain text when the join fails. The result
    /// is dropped when `non_empty_property_value` rejects it.
    fn extract(self, base_url: &url::Url) -> Option<PropertyValue> {
        let candidates = [
            self.get_first_level_element(),
            self.get_second_level_element(),
            self.get_third_level_element(),
        ];
        for candidate in candidates.into_iter().flatten() {
            let found = extract_img_element(&candidate, base_url).or_else(|| {
                let data = candidate
                    .attr("data")
                    .filter(non_empty_string)
                    .filter(|_| candidate.tag() == "object")?;
                Some(match base_url.join(&data) {
                    Ok(resolved) => PropertyValue::Url(UrlValue::new(resolved)),
                    Err(_) => PropertyValue::Plain(TextValue::new(data.to_string())),
                })
            });
            if found.is_some() {
                // The first candidate that produces anything decides the outcome.
                return found.filter(non_empty_property_value);
            }
        }
        None
    }

    /// An implied photo is allowed only when there is no explicit `photo`
    /// property and no collected value is a URL, fragment, nested item or image.
    fn can_extract(properties: &Properties) -> bool {
        if properties.contains_key("photo") {
            return false;
        }
        properties.values().flatten().all(|property_value| {
            !matches!(
                property_value,
                PropertyValue::Url(_)
                    | PropertyValue::Fragment { .. }
                    | PropertyValue::Item(_)
                    | PropertyValue::Image(_)
            )
        })
    }
}
/// Extractor for the implied `name` property; wraps the item's element.
struct ImpliedNameExtractor(ElementPtr);
/// Tag names that can supply an implied name through an attribute.
const IMPLIED_NAME_TAGS: [&str; 3] = ["img", "area", "abbr"];
/// Pairs each tag in `IMPLIED_NAME_TAGS` with the attribute holding its name text.
const IMPLIED_NAME_TAGS_TO_ATTR: [(&str, &str); 3] =
[("img", "alt"), ("area", "alt"), ("abbr", "title")];
impl ImpliedNameExtractor {
    /// Yields the item element itself when it carries root classes and its tag
    /// is listed in [`IMPLIED_NAME_TAGS`].
    fn get_first_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() || !IMPLIED_NAME_TAGS.contains(&item.tag()) {
            return None;
        }
        Some(item.clone())
    }

    /// Looks up which attribute carries the name text for `desired_tag`
    /// according to [`IMPLIED_NAME_TAGS_TO_ATTR`].
    fn attr_name(desired_tag: &str) -> Option<String> {
        IMPLIED_NAME_TAGS_TO_ATTR
            .iter()
            .find(|(tag, _)| *tag == desired_tag)
            .map(|(_, attr)| attr.to_string())
    }

    /// Tells whether `node` is a name-bearing tag without root classes whose
    /// name attribute is present and accepted by `non_empty_string`.
    fn require_attr_in_node(node: &Node) -> bool {
        if !IMPLIED_NAME_TAGS.contains(&node.tag()) || !node.root_classes().is_empty() {
            return false;
        }
        match Self::attr_name(node.tag()) {
            Some(attr) => node.attr(&attr).filter(non_empty_string).is_some(),
            None => false,
        }
    }

    /// Returns the child element of `parent` when it has exactly one.
    fn only_child(parent: &Node) -> Option<Node> {
        let mut children = parent.elements().into_iter();
        match (children.next(), children.next()) {
            (Some(only), None) => Some(only),
            _ => None,
        }
    }

    /// Yields the item's only child element when that child satisfies
    /// [`Self::require_attr_in_node`].
    ///
    /// Returns `None` when the item has no root classes or has any number of
    /// child elements other than one.
    fn get_second_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() {
            return None;
        }
        Self::only_child(item).filter(Self::require_attr_in_node)
    }

    /// Yields the name-bearing element nested two levels deep.
    ///
    /// Both levels follow the `:only-child` rule of the parsing specification:
    /// the item must have exactly one child element and it must not be a
    /// microformat root; that wrapper must in turn have exactly one child
    /// element, which must satisfy [`Self::require_attr_in_node`]. Counting is
    /// done before filtering so that siblings are never silently ignored,
    /// matching how the second level counts children.
    fn get_third_level_element(&self) -> Option<Node> {
        let item = &self.0.node;
        if item.root_classes().is_empty() {
            return None;
        }
        let wrapper = Self::only_child(item).filter(|child| child.root_classes().is_empty())?;
        Self::only_child(&wrapper).filter(Self::require_attr_in_node)
    }
}
impl ImpliedPropertyExtraction for ImpliedNameExtractor {
fn extract(self, base_url: &url::Url) -> Option<PropertyValue> {
[
self.get_first_level_element(),
self.get_second_level_element(),
self.get_third_level_element(),
]
.into_iter()
.flatten()
.find_map(|node| {
let attr_name = Self::attr_name(node.tag())?;
node.attr(&attr_name)
.map(|s| PropertyValue::Plain(TextValue::new(s)))
})
.filter(non_empty_property_value)
.or_else(|| {
self.0
.node
.text_content(base_url)
.ok()
.map(|Extraction { text, .. }| text)
.map(remove_surrounding_whitespace)
.map(|s| PropertyValue::Plain(TextValue::new(s)))
.filter(non_empty_property_value)
})
}
fn can_extract(properties: &Properties) -> bool {
!properties.contains_key("name")
&& !properties.values().flatten().any(|stored_value| {
matches!(
stored_value,
PropertyValue::Plain(_)
| PropertyValue::Fragment { .. }
| PropertyValue::Item(_)
)
})
}
}
// NOTE(review): `extract_implied` does not call the URL helpers defined in
// this impl, which is presumably why the dead-code lint is silenced here.
#[allow(dead_code)]
impl PropertiesParser {
    /// Builds a parser for `item`, keeping its own copy of `base_url`.
    pub fn new(item: ElementPtr, properties: Properties, base_url: &Url) -> Self {
        let base_url = base_url.clone();
        Self {
            item,
            properties,
            base_url,
        }
    }

    /// Reports whether a URL may be implied: there is no explicit `url`
    /// property and no collected value is a URL, fragment or nested item.
    #[doc(hidden)]
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn can_imply_url(&self) -> bool {
        if self.properties.contains_key("url") {
            return false;
        }
        self.properties.values().flatten().all(|v| {
            !matches!(
                v,
                PropertyValue::Url(_) | PropertyValue::Fragment { .. } | PropertyValue::Item(_)
            )
        })
    }

    /// Returns the sole child element of `elem`, provided it has no root classes.
    #[doc(hidden)]
    #[tracing::instrument(level = "trace", skip(elem), ret)]
    fn find_direct_only_child(elem: &swc_html_ast::Element) -> Option<Node> {
        let parent = Node {
            elem: elem.to_owned(),
        };
        let mut children = parent.elements().into_iter();
        match (children.next(), children.next()) {
            (Some(only), None) if only.root_classes().is_empty() => Some(only),
            _ => None,
        }
    }

    /// Searches the children of `element`'s sole child for an element without
    /// root classes whose tag is in `tag_names`.
    ///
    /// Tags are tried in the order given; for each tag the first matching
    /// element wins.
    #[doc(hidden)]
    #[tracing::instrument(level = "trace", ret)]
    fn find_nested_direct_only_of_type(
        element: &swc_html_ast::Element,
        tag_names: &[&str],
    ) -> Option<Node> {
        let wrapper = Self::find_direct_only_child(element)?;
        let plain_children: Vec<Node> = wrapper
            .elements()
            .into_iter()
            .filter(|node| node.root_classes().is_empty())
            .collect();
        tag_names.iter().find_map(|tag_name| {
            plain_children
                .iter()
                .find(|node| node.tag() == *tag_name)
                .cloned()
        })
    }

    /// Depth-first search for the first element whose tag is in `tag_names`,
    /// starting with `node` itself.
    ///
    /// Child elements that carry microformat or property classes are not
    /// descended into.
    #[doc(hidden)]
    #[tracing::instrument(level = "trace", skip(node), ret)]
    fn scan_for_first_of_element(node: &Node, tag_names: &[&str]) -> Option<Node> {
        if tag_names.contains(&node.tag()) {
            return Some(node.to_owned());
        }
        for child in node.elem.children.iter() {
            if let Child::Element(elem) = child {
                let child_node = Node {
                    elem: elem.to_owned(),
                };
                let is_plain = child_node.mf_classes().is_empty()
                    && child_node.property_classes().is_empty();
                if !is_plain {
                    continue;
                }
                let hit = Self::scan_for_first_of_element(&child_node, tag_names);
                if hit.is_some() {
                    return hit;
                }
            }
        }
        None
    }

    /// Resolves the `href` of the first `a`/`area` element found by
    /// `scan_for_first_of_element`; falls back to the raw attribute text when
    /// it cannot be joined against `base_url`.
    #[doc(hidden)]
    #[tracing::instrument(level = "trace", skip(node), ret)]
    fn imply_url(node: &Node, base_url: &Url) -> Option<PropertyValue> {
        let url_like_str = Self::scan_for_first_of_element(node, &["a", "area"])?.attr("href")?;
        let value = match base_url.join(&url_like_str) {
            Ok(url) => PropertyValue::Url(UrlValue::new(url)),
            Err(_) => PropertyValue::Plain(TextValue::new(url_like_str)),
        };
        Some(value)
    }

    /// Derives an implied URL when `can_imply_url` allows it, trying the item
    /// element, then its sole child, then the nested `a`/`area` element.
    #[doc(hidden)]
    #[tracing::instrument(level = "trace", skip(self), ret)]
    fn implied_url(&self) -> Option<PropertyValue> {
        if !self.can_imply_url() {
            return None;
        }
        let root = &self.item.node;
        Self::imply_url(root, &self.base_url)
            .or_else(|| {
                Self::find_direct_only_child(&root.elem)
                    .as_ref()
                    .and_then(|node| Self::imply_url(node, &self.base_url))
            })
            .or_else(|| {
                Self::find_nested_direct_only_of_type(&root.elem, &["a", "area"])
                    .as_ref()
                    .and_then(|node| Self::imply_url(node, &self.base_url))
            })
    }

    /// Collects the implied `name`, `photo` and `url` properties of the item.
    ///
    /// Each property is attempted only when its extractor's `can_extract`
    /// accepts the explicitly parsed properties, and is inserted as a
    /// single-element list. The returned map contains only implied values.
    #[tracing::instrument(level = "trace", skip(self), err)]
    pub(crate) fn extract_implied(self) -> Result<Properties, Error> {
        let mut implied = Properties::default();

        let name = ImpliedNameExtractor::can_extract(&self.properties)
            .then(|| ImpliedNameExtractor(Arc::clone(&self.item)).extract(&self.base_url))
            .flatten();
        if let Some(value) = name {
            implied.insert("name".to_string(), vec![value]);
        }

        let photo = ImpliedPhotoExtractor::can_extract(&self.properties)
            .then(|| ImpliedPhotoExtractor(Arc::clone(&self.item)).extract(&self.base_url))
            .flatten();
        if let Some(value) = photo {
            implied.insert("photo".to_string(), vec![value]);
        }

        let url = ImpliedUrlExtractor::can_extract(&self.properties)
            .then(|| ImpliedUrlExtractor(Arc::clone(&self.item)).extract(&self.base_url))
            .flatten();
        if let Some(value) = url {
            implied.insert("url".to_string(), vec![value]);
        }

        Ok(implied)
    }
}
// Tests for this module are declared out of line and only compiled for test builds.
#[cfg(test)]
mod test;