use anyhow::{Context, Result, bail};
use async_trait::async_trait;
use regex::Regex;
use scraper::{ElementRef, Html, Selector};
use super::{SiteContent, SiteMetadata, SiteProvider};
use crate::content::ContentRouter;
use crate::http_client::AcceleratedClient;
/// User-supplied configuration for one CSS-selector-based site extractor.
///
/// Selectors are stored as strings here; they are compiled (and validated)
/// once in [`CssExtractorProvider::new`].
#[derive(Debug, Clone)]
pub struct CssExtractorConfig {
    /// Extractor name; used in error messages and as the `css:<name>` platform tag.
    pub name: String,
    /// Regex matched against candidate URLs to decide whether this extractor applies.
    pub url_pattern: String,
    /// CSS selector for the main content element(s). Required.
    pub content_selector: String,
    /// Optional CSS selector whose first match's text becomes the title.
    pub title_selector: Option<String>,
    /// Optional CSS selector whose first match's text becomes the author.
    pub author_selector: Option<String>,
    /// Optional CSS selector whose first match's text becomes the published date.
    pub date_selector: Option<String>,
    /// CSS selectors for subtrees to strip from the extracted content (ads, nav, …).
    pub remove_selectors: Vec<String>,
}
/// A site provider driven entirely by a [`CssExtractorConfig`]: all regexes
/// and selectors are pre-compiled at construction so `matches`/`extract`
/// never re-parse them.
pub struct CssExtractorProvider {
    // Original configuration, kept for its name and error messages.
    config: CssExtractorConfig,
    // Compiled form of `config.url_pattern`.
    url_regex: Regex,
    // Compiled form of `config.content_selector`.
    content_sel: Selector,
    // Compiled optional metadata selectors (None when not configured).
    title_sel: Option<Selector>,
    author_sel: Option<Selector>,
    date_sel: Option<Selector>,
    // Compiled removal selectors, in config order.
    remove_sels: Vec<Selector>,
    // `config.name` leaked once to satisfy the `&'static str` return of
    // `SiteProvider::name` (see `new`).
    static_name: &'static str,
}
impl CssExtractorProvider {
    /// Builds a provider from `config`, eagerly compiling the URL pattern and
    /// every CSS selector so configuration errors surface here rather than
    /// during extraction.
    ///
    /// # Errors
    /// Fails if the URL pattern is not a valid regex or any configured
    /// selector does not parse; the message names the offending field.
    pub fn new(config: CssExtractorConfig) -> Result<Self> {
        let url_regex = Regex::new(&config.url_pattern).with_context(|| {
            format!(
                "invalid URL pattern '{}' in CSS extractor '{}'",
                config.url_pattern, config.name
            )
        })?;

        let content_sel = compile_selector(&config.content_selector, &config.name, "content")?;

        let title_sel = match config.title_selector.as_deref() {
            Some(raw) => Some(compile_selector(raw, &config.name, "title")?),
            None => None,
        };
        let author_sel = match config.author_selector.as_deref() {
            Some(raw) => Some(compile_selector(raw, &config.name, "author")?),
            None => None,
        };
        let date_sel = match config.date_selector.as_deref() {
            Some(raw) => Some(compile_selector(raw, &config.name, "published")?),
            None => None,
        };

        let mut remove_sels = Vec::with_capacity(config.remove_selectors.len());
        for (i, raw) in config.remove_selectors.iter().enumerate() {
            remove_sels.push(compile_selector(raw, &config.name, &format!("remove[{i}]"))?);
        }

        // `SiteProvider::name` returns `&'static str`, so the (small,
        // once-per-provider) name string is deliberately leaked for the life
        // of the process.
        let static_name: &'static str = Box::leak(config.name.clone().into_boxed_str());

        Ok(Self {
            config,
            url_regex,
            content_sel,
            title_sel,
            author_sel,
            date_sel,
            remove_sels,
            static_name,
        })
    }
}
#[async_trait]
impl SiteProvider for CssExtractorProvider {
fn name(&self) -> &'static str {
self.static_name
}
fn matches(&self, url: &str) -> bool {
self.url_regex.is_match(url)
}
async fn extract(
&self,
url: &str,
_client: &AcceleratedClient,
_cookies: Option<&str>,
prefetched_html: Option<&[u8]>,
) -> Result<SiteContent> {
let html_bytes = prefetched_html.with_context(|| {
format!(
"CSS extractor '{}' requires pre-fetched HTML but none was provided for {url}",
self.config.name
)
})?;
let html_str = std::str::from_utf8(html_bytes)
.with_context(|| format!("HTML body for {url} is not valid UTF-8"))?;
let document = Html::parse_document(html_str);
let title = extract_text_opt(&document, self.title_sel.as_ref());
let author = extract_text_opt(&document, self.author_sel.as_ref());
let published = extract_text_opt(&document, self.date_sel.as_ref());
let content_html = build_content_html(&document, &self.content_sel, &self.remove_sels, url);
if content_html.is_empty() {
return Ok(SiteContent {
markdown: String::new(),
metadata: build_metadata(&self.config.name, url, title, author, published),
});
}
let router = ContentRouter::new();
let result = router
.convert_with_url(content_html.as_bytes(), "text/html", Some(url))
.with_context(|| {
format!(
"ContentRouter failed for CSS extractor '{}' on {url}",
self.config.name
)
})?;
Ok(SiteContent {
markdown: result.markdown,
metadata: build_metadata(&self.config.name, url, title, author, published),
})
}
}
fn compile_selector(selector: &str, extractor_name: &str, field: &str) -> Result<Selector> {
Selector::parse(selector).map_err(|e| {
anyhow::anyhow!(
"invalid CSS selector '{selector}' for field '{field}' \
in extractor '{extractor_name}': {e}"
)
})
}
/// Returns the whitespace-trimmed text of the first element matching
/// `selector`, or `None` when the selector is absent, matches nothing, or
/// the matched text is blank. Inner text nodes are joined with single spaces.
fn extract_text_opt(document: &Html, selector: Option<&Selector>) -> Option<String> {
    let element = document.select(selector?).next()?;
    let joined = element.text().collect::<Vec<_>>().join(" ");
    let text = joined.trim();
    (!text.is_empty()).then(|| text.to_string())
}
/// Serialises `element` to an HTML string, omitting any subtree matched by
/// one of `remove_sels`. With no removal selectors this delegates to
/// scraper's own serializer as a fast path.
fn serialise_element_filtered(element: ElementRef<'_>, remove_sels: &[Selector]) -> String {
    match remove_sels {
        [] => element.html(),
        sels => {
            let mut buf = String::new();
            serialise_node_recursive(element, sels, &mut buf);
            buf
        }
    }
}
/// Recursively serialises `element` into `out`, skipping any element (and
/// its entire subtree) matched by one of `remove_sels`.
///
/// Text nodes and attribute values are HTML-escaped on output. The previous
/// version wrote them verbatim, which corrupted the re-parsed document:
/// scraper stores entity-decoded text, so `&lt;` or `&amp;` in the source
/// page became a literal `<` / `&` here, and a `"` inside an attribute value
/// terminated the attribute early. Escaping also makes this path consistent
/// with the `element.html()` fast path in `serialise_element_filtered`,
/// which escapes via html5ever's serializer.
///
/// Non-element, non-text children (comments, processing instructions) are
/// intentionally dropped, as before.
fn serialise_node_recursive(element: ElementRef<'_>, remove_sels: &[Selector], out: &mut String) {
    if remove_sels.iter().any(|s| s.matches(&element)) {
        return;
    }
    let name = element.value().name();
    out.push('<');
    out.push_str(name);
    for (attr, val) in element.value().attrs() {
        out.push(' ');
        out.push_str(attr);
        out.push_str("=\"");
        push_html_escaped(val, true, out);
        out.push('"');
    }
    out.push('>');
    for child in element.children() {
        match child.value() {
            scraper::node::Node::Text(t) => push_html_escaped(t, false, out),
            scraper::node::Node::Element(_) => {
                if let Some(child_el) = ElementRef::wrap(child) {
                    serialise_node_recursive(child_el, remove_sels, out);
                }
            }
            _ => {}
        }
    }
    out.push_str("</");
    out.push_str(name);
    out.push('>');
}

/// Appends `s` to `out` with the minimal HTML escaping needed for safe
/// re-parsing: `&`, `<`, `>` always; `"` additionally inside attribute values.
fn push_html_escaped(s: &str, in_attribute: bool, out: &mut String) {
    for c in s.chars() {
        match c {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            '"' if in_attribute => out.push_str("&quot;"),
            _ => out.push(c),
        }
    }
}
/// Selects every element matching `content_sel`, serialises each with the
/// removal selectors applied, and wraps the surviving fragments in a minimal
/// HTML document for the ContentRouter.
///
/// Returns an empty string when no match produces non-blank output; callers
/// treat that as "no content found" rather than an error.
fn build_content_html(
    document: &Html,
    content_sel: &Selector,
    remove_sels: &[Selector],
    url: &str,
) -> String {
    let mut fragments: Vec<String> = document
        .select(content_sel)
        .map(|el| serialise_element_filtered(el, remove_sels))
        .collect();
    fragments.retain(|f| !f.trim().is_empty());
    if fragments.is_empty() {
        return String::new();
    }
    // `url` comes from outside and is embedded in an HTML comment below; a
    // literal "--" (let alone "-->") inside a comment can terminate it early
    // and inject markup into the synthesised document, so break up any
    // double hyphens before interpolating.
    let safe_url = url.replace("--", "- -");
    format!(
        "<!DOCTYPE html><html><head></head><body>\
         <!-- extracted by nab CSS extractor from {safe_url} -->\
         {}\
         </body></html>",
        fragments.join("\n")
    )
}
/// Assembles [`SiteMetadata`] for a CSS-extracted page.
///
/// The platform is namespaced as `css:<extractor name>`; this provider never
/// populates media URLs or engagement data.
fn build_metadata(
    name: &str,
    url: &str,
    title: Option<String>,
    author: Option<String>,
    published: Option<String>,
) -> SiteMetadata {
    let platform = format!("css:{name}");
    let canonical_url = url.to_owned();
    SiteMetadata {
        author,
        title,
        published,
        platform,
        canonical_url,
        media_urls: Vec::new(),
        engagement: None,
    }
}
pub fn validate_config(config: &CssExtractorConfig) -> Result<()> {
if config.name.is_empty() {
bail!("CSS extractor name must not be empty");
}
if config.content_selector.is_empty() {
bail!("CSS extractor '{}' has no content.selector", config.name);
}
Selector::parse(&config.content_selector).map_err(|e| {
anyhow::anyhow!(
"CSS extractor '{}' has invalid content.selector '{}': {e}",
config.name,
config.content_selector
)
})?;
Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::sync::LazyLock;

    // Selectors compiled once and shared across tests; parsing is infallible
    // for these literals, hence `expect`.
    static H1_SELECTOR: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse("h1").expect("static h1 selector"));
    static H2_SELECTOR: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse("h2").expect("static h2 selector"));
    static P_SELECTOR: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse("p").expect("static p selector"));
    static ARTICLE_SELECTOR: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse("article").expect("static article selector"));
    static NAV_SELECTOR: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse("nav").expect("static nav selector"));
    static ASIDE_SELECTOR: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse("aside").expect("static aside selector"));
    static MAIN_SELECTOR: LazyLock<Selector> =
        LazyLock::new(|| Selector::parse("main").expect("static main selector"));

    // Baseline valid config; individual tests override fields via struct update.
    fn simple_config() -> CssExtractorConfig {
        CssExtractorConfig {
            name: "test-blog".to_string(),
            url_pattern: r"myblog\.com".to_string(),
            content_selector: "article".to_string(),
            title_selector: Some("h1.title".to_string()),
            author_selector: Some(".author".to_string()),
            date_selector: Some("time".to_string()),
            remove_selectors: vec![".sidebar".to_string(), "nav".to_string()],
        }
    }

    // Panics on invalid config — tests that expect failure call `new` directly.
    fn make_provider(config: CssExtractorConfig) -> CssExtractorProvider {
        CssExtractorProvider::new(config).expect("valid config should build provider")
    }

    // --- URL matching -----------------------------------------------------

    #[test]
    fn matches_url_that_satisfies_pattern() {
        let p = make_provider(simple_config());
        assert!(p.matches("https://myblog.com/post/hello-world"));
    }

    #[test]
    fn does_not_match_url_outside_pattern() {
        let p = make_provider(simple_config());
        assert!(!p.matches("https://other.com/post/hello-world"));
    }

    #[test]
    fn matches_case_sensitive_by_default() {
        let p = make_provider(simple_config());
        assert!(!p.matches("https://MYBLOG.COM/post"));
    }

    #[test]
    fn case_insensitive_flag_in_pattern_works() {
        // Users opt into case-insensitivity via the regex's own (?i) flag.
        let config = CssExtractorConfig {
            url_pattern: r"(?i)myblog\.com".to_string(),
            ..simple_config()
        };
        let p = make_provider(config);
        assert!(p.matches("https://MYBLOG.COM/post"));
        assert!(p.matches("https://myblog.com/post"));
    }

    #[test]
    fn provider_name_matches_config_name() {
        let p = make_provider(simple_config());
        assert_eq!(p.name(), "test-blog");
    }

    // --- Construction / validation errors ---------------------------------

    #[test]
    fn rejects_invalid_url_pattern() {
        let config = CssExtractorConfig {
            url_pattern: r"[invalid".to_string(),
            ..simple_config()
        };
        assert!(CssExtractorProvider::new(config).is_err());
    }

    #[test]
    fn rejects_invalid_content_selector() {
        let config = CssExtractorConfig {
            content_selector: "::invalid-pseudo".to_string(),
            ..simple_config()
        };
        assert!(CssExtractorProvider::new(config).is_err());
    }

    #[test]
    fn rejects_invalid_remove_selector() {
        let config = CssExtractorConfig {
            remove_selectors: vec!["::bad".to_string()],
            ..simple_config()
        };
        assert!(CssExtractorProvider::new(config).is_err());
    }

    #[test]
    fn rejects_invalid_title_selector() {
        let config = CssExtractorConfig {
            title_selector: Some("::bad".to_string()),
            ..simple_config()
        };
        assert!(CssExtractorProvider::new(config).is_err());
    }

    #[test]
    fn accepts_config_with_no_optional_selectors() {
        let config = CssExtractorConfig {
            title_selector: None,
            author_selector: None,
            date_selector: None,
            remove_selectors: vec![],
            ..simple_config()
        };
        assert!(CssExtractorProvider::new(config).is_ok());
    }

    // --- extract_text_opt --------------------------------------------------

    #[test]
    fn extract_text_returns_none_when_selector_absent() {
        let doc = Html::parse_document("<html><body><p>Hello</p></body></html>");
        assert!(extract_text_opt(&doc, None).is_none());
    }

    #[test]
    fn extract_text_returns_text_of_first_match() {
        // Leading/trailing whitespace is trimmed.
        let doc = Html::parse_document("<html><body><h1> Title Here </h1></body></html>");
        assert_eq!(
            extract_text_opt(&doc, Some(&H1_SELECTOR)),
            Some("Title Here".to_string())
        );
    }

    #[test]
    fn extract_text_returns_none_when_no_element_matches() {
        let doc = Html::parse_document("<html><body><h1>Only H1</h1></body></html>");
        assert!(extract_text_opt(&doc, Some(&H2_SELECTOR)).is_none());
    }

    #[test]
    fn extract_text_joins_inner_text_nodes() {
        let doc = Html::parse_document("<p>Hello <strong>world</strong></p>");
        let text = extract_text_opt(&doc, Some(&P_SELECTOR)).unwrap();
        assert!(text.contains("Hello"));
        assert!(text.contains("world"));
    }

    // --- serialise_element_filtered ----------------------------------------

    #[test]
    fn serialise_keeps_all_content_when_no_remove_sels() {
        let html = "<html><body><article><p>Keep</p><nav>Nav</nav></article></body></html>";
        let doc = Html::parse_document(html);
        let el = doc.select(&ARTICLE_SELECTOR).next().unwrap();
        let out = serialise_element_filtered(el, &[]);
        assert!(out.contains("Keep"));
        assert!(out.contains("Nav"));
    }

    #[test]
    fn serialise_strips_removed_element() {
        let html = "<html><body><article><p>Keep</p><nav>Remove</nav></article></body></html>";
        let doc = Html::parse_document(html);
        let el = doc.select(&ARTICLE_SELECTOR).next().unwrap();
        let out = serialise_element_filtered(el, std::slice::from_ref(&*NAV_SELECTOR));
        assert!(out.contains("Keep"));
        assert!(!out.contains("Remove"));
    }

    #[test]
    fn serialise_strips_nested_children_of_removed() {
        // Removing an element must also drop its whole subtree.
        let html = "<html><body><article>\
            <aside><a>Link1</a><a>Link2</a></aside>\
            <p>Body</p>\
            </article></body></html>";
        let doc = Html::parse_document(html);
        let el = doc.select(&ARTICLE_SELECTOR).next().unwrap();
        let out = serialise_element_filtered(el, std::slice::from_ref(&*ASIDE_SELECTOR));
        assert!(out.contains("Body"));
        assert!(!out.contains("Link1"));
        assert!(!out.contains("Link2"));
    }

    // --- build_content_html -------------------------------------------------

    #[test]
    fn content_html_returns_empty_when_selector_misses() {
        let html = "<html><body><p>No article here</p></body></html>";
        let doc = Html::parse_document(html);
        let result = build_content_html(&doc, &ARTICLE_SELECTOR, &[], "https://example.com");
        assert!(result.is_empty());
    }

    #[test]
    fn content_html_captures_matched_element() {
        let html = "<html><body><article><p>Article text</p></article></body></html>";
        let doc = Html::parse_document(html);
        let result = build_content_html(&doc, &ARTICLE_SELECTOR, &[], "https://example.com");
        assert!(result.contains("Article text"));
        assert!(result.contains("<html>"));
        assert!(result.contains("<body>"));
    }

    #[test]
    fn content_html_joins_multiple_matches() {
        let html = "<html><body>\
            <article><p>First</p></article>\
            <article><p>Second</p></article>\
            </body></html>";
        let doc = Html::parse_document(html);
        let result = build_content_html(&doc, &ARTICLE_SELECTOR, &[], "https://example.com");
        assert!(result.contains("First"));
        assert!(result.contains("Second"));
    }

    #[test]
    fn content_html_wraps_in_document_for_content_router() {
        let html = "<html><body><main><h1>Hello</h1><p>World</p></main></body></html>";
        let doc = Html::parse_document(html);
        let result = build_content_html(&doc, &MAIN_SELECTOR, &[], "https://example.com");
        assert!(result.starts_with("<!DOCTYPE html>"));
    }

    // --- validate_config -----------------------------------------------------

    #[test]
    fn validate_config_rejects_empty_name() {
        let config = CssExtractorConfig {
            name: String::new(),
            ..simple_config()
        };
        assert!(validate_config(&config).is_err());
    }

    #[test]
    fn validate_config_rejects_empty_selector() {
        let config = CssExtractorConfig {
            content_selector: String::new(),
            ..simple_config()
        };
        assert!(validate_config(&config).is_err());
    }

    #[test]
    fn validate_config_rejects_invalid_selector() {
        let config = CssExtractorConfig {
            content_selector: "::bad-selector".to_string(),
            ..simple_config()
        };
        assert!(validate_config(&config).is_err());
    }

    #[test]
    fn validate_config_accepts_valid_config() {
        assert!(validate_config(&simple_config()).is_ok());
    }

    // --- build_metadata -------------------------------------------------------

    #[test]
    fn metadata_platform_includes_extractor_name() {
        let meta = build_metadata("my-blog", "https://example.com", None, None, None);
        assert_eq!(meta.platform, "css:my-blog");
    }

    #[test]
    fn metadata_canonical_url_is_set() {
        let meta = build_metadata("x", "https://example.com/page", None, None, None);
        assert_eq!(meta.canonical_url, "https://example.com/page");
    }

    #[test]
    fn metadata_optional_fields_propagated() {
        let meta = build_metadata(
            "x",
            "https://example.com",
            Some("My Title".to_string()),
            Some("Alice".to_string()),
            Some("2026-01-01".to_string()),
        );
        assert_eq!(meta.title.as_deref(), Some("My Title"));
        assert_eq!(meta.author.as_deref(), Some("Alice"));
        assert_eq!(meta.published.as_deref(), Some("2026-01-01"));
    }

    // --- extract (end-to-end over prefetched HTML) ---------------------------

    #[tokio::test]
    async fn extract_returns_error_when_no_prefetched_html() {
        let client = AcceleratedClient::new().expect("client");
        let provider = make_provider(simple_config());
        let result = provider
            .extract("https://myblog.com/post/1", &client, None, None)
            .await;
        assert!(result.is_err());
        let msg = result.unwrap_err().to_string();
        assert!(msg.contains("pre-fetched HTML"));
    }

    #[tokio::test]
    async fn extract_returns_empty_markdown_when_selector_misses() {
        // A selector miss is not an error: empty markdown, metadata intact.
        let client = AcceleratedClient::new().expect("client");
        let html = b"<html><body><p>No article here</p></body></html>";
        let provider = make_provider(simple_config());
        let result = provider
            .extract("https://myblog.com/post/1", &client, None, Some(html))
            .await
            .expect("should succeed even with no match");
        assert!(result.markdown.is_empty());
    }

    #[tokio::test]
    async fn extract_converts_article_html_to_markdown() {
        let client = AcceleratedClient::new().expect("client");
        let html = b"<html><body>\
            <h1 class=\"title\">Hello World</h1>\
            <article><h2>Section</h2><p>Body text here.</p></article>\
            </body></html>";
        let provider = make_provider(simple_config());
        let result = provider
            .extract("https://myblog.com/post/1", &client, None, Some(html))
            .await
            .expect("extract should succeed");
        assert!(!result.markdown.is_empty());
        assert!(result.markdown.contains("Body text here"));
        assert_eq!(result.metadata.platform, "css:test-blog");
    }

    #[tokio::test]
    async fn extract_populates_title_from_selector() {
        let client = AcceleratedClient::new().expect("client");
        let html = b"<html><body>\
            <h1 class=\"title\">My Post Title</h1>\
            <article><p>Content.</p></article>\
            </body></html>";
        let provider = make_provider(simple_config());
        let result = provider
            .extract("https://myblog.com/post/1", &client, None, Some(html))
            .await
            .expect("extract should succeed");
        assert_eq!(result.metadata.title.as_deref(), Some("My Post Title"));
    }

    #[tokio::test]
    async fn extract_removes_sidebar_from_content() {
        // remove_selectors (".sidebar") should strip content end-to-end.
        let client = AcceleratedClient::new().expect("client");
        let html = b"<html><body>\
            <article>\
            <p>Main content</p>\
            <div class=\"sidebar\">Sidebar ads</div>\
            </article>\
            </body></html>";
        let provider = make_provider(simple_config());
        let result = provider
            .extract("https://myblog.com/post/1", &client, None, Some(html))
            .await
            .expect("extract should succeed");
        assert!(result.markdown.contains("Main content"));
        assert!(!result.markdown.contains("Sidebar ads"));
    }

    #[tokio::test]
    async fn extract_rejects_non_utf8_html() {
        let client = AcceleratedClient::new().expect("client");
        // Invalid UTF-8 byte sequence.
        let html: &[u8] = &[0xFF, 0xFE, 0x00];
        let provider = make_provider(simple_config());
        let result = provider
            .extract("https://myblog.com/post/1", &client, None, Some(html))
            .await;
        assert!(result.is_err());
    }

    #[tokio::test]
    async fn extract_canonical_url_matches_input_url() {
        let client = AcceleratedClient::new().expect("client");
        let html = b"<html><body><article><p>Text.</p></article></body></html>";
        let provider = make_provider(simple_config());
        let result = provider
            .extract("https://myblog.com/post/42", &client, None, Some(html))
            .await
            .expect("extract should succeed");
        assert_eq!(result.metadata.canonical_url, "https://myblog.com/post/42");
    }
}