rss-funnel 0.0.5

A composable feed processing pipeline
use ego_tree::{NodeId, NodeRef};
use scraper::{Html, Selector};

/// Pairs of `(CSS selector, attribute name)` describing which attributes may
/// hold URLs that need to be resolved against a base URL.
///
/// Each selector matches any element carrying the attribute named in the
/// second tuple field; `convert_relative_url` compiles the selectors and
/// rewrites the named attribute on every matched element.
const RELATIVE_URL_PROPERTIES: [(&str, &str); 3] = [
  ("*[href]", "href"),
  ("*[src]", "src"),
  ("*[srcset]", "srcset"),
];

pub fn convert_relative_url(html: &mut Html, base_url: &str) {
  use html5ever::{namespace_url, ns, LocalName, QualName};
  lazy_static::lazy_static! {
    static ref SELECTORS: Vec<(Selector, &'static str)> = {
      RELATIVE_URL_PROPERTIES
        .iter()
        .map(|(selector, attr)| (Selector::parse(selector).expect("bad selector"), *attr))
        .collect()
    };
  }

  let Ok(base_url) = url::Url::parse(base_url) else {
    return;
  };

  for (selector, attr) in SELECTORS.iter() {
    let node_ids = html.select(selector).map(|e| e.id()).collect::<Vec<_>>();
    for node_id in node_ids {
      let mut node = html.tree.get_mut(node_id).expect("unreachable");

      let scraper::Node::Element(elem) = node.value() else {
        continue;
      };

      let attr_name = QualName::new(None, ns!(), LocalName::from(*attr));
      let Some(attr_value) = elem.attrs.get_mut(&attr_name) else {
        continue;
      };

      let Ok(url) = base_url.join(attr_value) else {
        continue;
      };

      attr_value.clear();
      attr_value.push_slice(url.as_str());
    }
  }
}

/// Extracts the inner markup of the `<body>` element from an HTML document.
///
/// The input is parsed as a full document; the first `<body>` element's inner
/// HTML is returned with surrounding whitespace trimmed. If no `<body>`
/// element is found, the input string is returned unchanged.
pub fn html_body(html: &str) -> String {
  let document = Html::parse_document(html);
  let body_selector = Selector::parse("body").unwrap();

  match document.select(&body_selector).next() {
    Some(body) => body.inner_html().trim().to_string(),
    None => html.to_string(),
  }
}

/// Returns the id of the first node beneath the wrapper nodes at the top of a
/// parsed tree.
///
/// Starting from `node`, this descends through first children for as long as
/// the current node is a fragment node, a document node, or an `<html>`
/// element, and returns the id of the first node that is none of those.
///
/// If a wrapper node has no children (e.g. the tree was built from empty
/// markup), the id of that innermost wrapper is returned instead of panicking.
pub fn fragment_root_node_id(mut node: NodeRef<'_, scraper::Node>) -> NodeId {
  loop {
    let val = node.value();
    let is_wrapper = val.is_fragment()
      || val.is_document()
      || val.as_element().is_some_and(|e| e.name() == "html");

    match node.first_child() {
      // Keep unwrapping while we are still on a wrapper node.
      Some(child) if is_wrapper => node = child,
      // Either a real content node, or a wrapper with nothing inside.
      _ => return node.id(),
    }
  }
}

#[cfg(test)]
mod test {
  use super::html_body;

  /// `html_body` should return only the trimmed markup inside `<body>`,
  /// dropping the `<head>` section and the surrounding wrapper elements.
  #[test]
  fn test_html_body() {
    let document = r#"
      <html>
        <head>
          <title>Test</title>
        </head>
        <body>
          <p>Test</p>
        </body>
      </html>
    "#;

    let body_markup = html_body(document);

    assert_eq!(body_markup, "<p>Test</p>");
  }
}