use scraper::{Html, Selector};
use crate::error::{KtoError, Result};
use crate::fetch::PageContent;
use crate::watch::Extraction;
pub fn extract(content: &PageContent, strategy: &Extraction) -> Result<String> {
match strategy {
Extraction::Auto => extract_auto(content),
Extraction::Selector { selector } => extract_selector(&content.html, selector),
Extraction::Full => extract_full(content),
Extraction::Meta { tags } => extract_meta(&content.html, tags),
Extraction::Rss => extract_rss(content),
Extraction::JsonLd { types } => extract_jsonld(&content.html, types.as_ref()),
}
}
fn extract_rss(content: &PageContent) -> Result<String> {
content.text.clone().ok_or_else(|| {
KtoError::ExtractionError("No RSS content available - feed may be empty or malformed".into())
})
}
const MIN_READABILITY_CHARS: usize = 50;
fn extract_auto(content: &PageContent) -> Result<String> {
if let Ok(readability) = readability_js::Readability::new() {
if let Ok(article) = readability.parse(&content.html) {
let text = article.text_content.trim();
if text.len() >= MIN_READABILITY_CHARS {
return Ok(article.text_content);
}
}
}
if let Some(ref text) = content.text {
if !text.trim().is_empty() {
return Ok(text.clone());
}
}
let document = Html::parse_document(&content.html);
let body_selector =
Selector::parse("body").map_err(|e| KtoError::ExtractionError(format!("{:?}", e)))?;
if let Some(body) = document.select(&body_selector).next() {
let text: String = body.text().collect::<Vec<_>>().join(" ");
if !text.trim().is_empty() {
return Ok(text);
}
}
Err(KtoError::ExtractionError(
"Could not extract any content from page".into(),
))
}
fn extract_selector(html: &str, selector_str: &str) -> Result<String> {
let document = Html::parse_document(html);
let selector = Selector::parse(selector_str)
.map_err(|e| KtoError::ExtractionError(format!("Invalid selector: {:?}", e)))?;
let texts: Vec<String> = document
.select(&selector)
.map(|el| el.text().collect::<Vec<_>>().join(" "))
.collect();
if texts.is_empty() {
return Err(KtoError::ExtractionError(format!(
"Selector '{}' matched no elements",
selector_str
)));
}
Ok(texts.join("\n"))
}
fn extract_full(content: &PageContent) -> Result<String> {
if let Some(ref text) = content.text {
return Ok(text.clone());
}
let document = Html::parse_document(&content.html);
let body_selector =
Selector::parse("body").map_err(|e| KtoError::ExtractionError(format!("{:?}", e)))?;
if let Some(body) = document.select(&body_selector).next() {
let text: String = body.text().collect::<Vec<_>>().join(" ");
return Ok(text);
}
Err(KtoError::ExtractionError(
"Could not find body element".into(),
))
}
fn extract_meta(html: &str, tags: &[String]) -> Result<String> {
let document = Html::parse_document(html);
let mut values = Vec::new();
for tag in tags {
let name_selector = Selector::parse(&format!("meta[name=\"{}\"]", tag))
.map_err(|e| KtoError::ExtractionError(format!("{:?}", e)))?;
for el in document.select(&name_selector) {
if let Some(content) = el.value().attr("content") {
values.push(content.to_string());
}
}
let prop_selector = Selector::parse(&format!("meta[property=\"{}\"]", tag))
.map_err(|e| KtoError::ExtractionError(format!("{:?}", e)))?;
for el in document.select(&prop_selector) {
if let Some(content) = el.value().attr("content") {
values.push(content.to_string());
}
}
}
if values.is_empty() {
return Err(KtoError::ExtractionError(format!(
"No meta tags found for: {:?}",
tags
)));
}
Ok(values.join("\n"))
}
fn extract_jsonld(html: &str, type_filter: Option<&Vec<String>>) -> Result<String> {
let document = Html::parse_document(html);
let jsonld_selector = Selector::parse(r#"script[type="application/ld+json"]"#)
.map_err(|e| KtoError::ExtractionError(format!("{:?}", e)))?;
let mut extracted_items = Vec::new();
for script in document.select(&jsonld_selector) {
let text: String = script.text().collect();
if let Ok(json) = serde_json::from_str::<serde_json::Value>(&text) {
let items = flatten_jsonld(&json, type_filter);
extracted_items.extend(items);
}
}
if extracted_items.is_empty() {
return Err(KtoError::ExtractionError(
"No JSON-LD structured data found".into(),
));
}
let formatted = extracted_items
.into_iter()
.map(format_jsonld_item)
.collect::<Vec<_>>()
.join("\n\n---\n\n");
Ok(formatted)
}
fn flatten_jsonld(
json: &serde_json::Value,
type_filter: Option<&Vec<String>>,
) -> Vec<serde_json::Value> {
let mut items = Vec::new();
match json {
serde_json::Value::Object(map) => {
if let Some(serde_json::Value::Array(graph)) = map.get("@graph") {
for item in graph {
items.extend(flatten_jsonld(item, type_filter));
}
} else {
if matches_type_filter(json, type_filter) {
items.push(json.clone());
}
}
}
serde_json::Value::Array(arr) => {
for item in arr {
items.extend(flatten_jsonld(item, type_filter));
}
}
_ => {}
}
items
}
fn matches_type_filter(json: &serde_json::Value, filter: Option<&Vec<String>>) -> bool {
let filter = match filter {
Some(f) if !f.is_empty() => f,
_ => return true, };
if let Some(type_value) = json.get("@type") {
match type_value {
serde_json::Value::String(s) => type_matches_filter(s, filter),
serde_json::Value::Array(arr) => arr.iter().any(|t| {
if let serde_json::Value::String(s) = t {
type_matches_filter(s, filter)
} else {
false
}
}),
_ => false,
}
} else {
false
}
}
fn type_matches_filter(type_str: &str, filter: &[String]) -> bool {
let clean_type = type_str
.rsplit_once(':')
.map(|(_, t)| t)
.unwrap_or(type_str);
filter.iter().any(|f| {
let clean_filter = f.rsplit_once(':').map(|(_, t)| t).unwrap_or(f);
clean_type.eq_ignore_ascii_case(clean_filter)
|| clean_type.to_lowercase().contains(&clean_filter.to_lowercase())
})
}
fn format_jsonld_item(json: serde_json::Value) -> String {
let mut lines = Vec::new();
if let Some(t) = json.get("@type") {
let type_str = match t {
serde_json::Value::String(s) => s.clone(),
serde_json::Value::Array(arr) => arr
.iter()
.filter_map(|v| v.as_str())
.collect::<Vec<_>>()
.join(", "),
_ => "Unknown".to_string(),
};
lines.push(format!("[{}]", type_str));
}
let type_hint = json.get("@type").and_then(|t| t.as_str()).unwrap_or("");
if type_hint.eq_ignore_ascii_case("product") {
if let Some(name) = json.get("name").and_then(|v| v.as_str()) {
lines.push(format!("Name: {}", name));
}
if let Some(offers) = json.get("offers") {
extract_offer_info(&mut lines, offers);
}
if let Some(sku) = json.get("sku").and_then(|v| v.as_str()) {
lines.push(format!("SKU: {}", sku));
}
if let Some(brand) = extract_nested_name(&json, "brand") {
lines.push(format!("Brand: {}", brand));
}
}
else if type_hint.eq_ignore_ascii_case("article")
|| type_hint.eq_ignore_ascii_case("newsarticle")
|| type_hint.eq_ignore_ascii_case("blogposting")
{
if let Some(headline) = json.get("headline").and_then(|v| v.as_str()) {
lines.push(format!("Headline: {}", headline));
}
if let Some(author) = extract_nested_name(&json, "author") {
lines.push(format!("Author: {}", author));
}
if let Some(date) = json.get("datePublished").and_then(|v| v.as_str()) {
lines.push(format!("Published: {}", date));
}
if let Some(date) = json.get("dateModified").and_then(|v| v.as_str()) {
lines.push(format!("Modified: {}", date));
}
}
else if type_hint.eq_ignore_ascii_case("event") {
if let Some(name) = json.get("name").and_then(|v| v.as_str()) {
lines.push(format!("Event: {}", name));
}
if let Some(date) = json.get("startDate").and_then(|v| v.as_str()) {
lines.push(format!("Start: {}", date));
}
if let Some(date) = json.get("endDate").and_then(|v| v.as_str()) {
lines.push(format!("End: {}", date));
}
if let Some(loc) = extract_nested_name(&json, "location") {
lines.push(format!("Location: {}", loc));
}
}
else {
if let Some(name) = json.get("name").and_then(|v| v.as_str()) {
lines.push(format!("Name: {}", name));
}
if let Some(desc) = json.get("description").and_then(|v| v.as_str()) {
let truncated: String = desc.chars().take(200).collect();
lines.push(format!("Description: {}...", truncated));
}
}
if lines.len() <= 1 {
format!("{}\n{}", lines.join("\n"), json.to_string())
} else {
lines.join("\n")
}
}
fn extract_offer_info(lines: &mut Vec<String>, offers: &serde_json::Value) {
let offer_list: Vec<&serde_json::Value> = match offers {
serde_json::Value::Array(arr) => arr.iter().collect(),
obj @ serde_json::Value::Object(_) => vec![obj],
_ => return,
};
let max_offers = 5;
let show_count = offer_list.len() > max_offers;
for (i, offer) in offer_list.iter().take(max_offers).enumerate() {
let mut offer_parts = Vec::new();
if let Some(seller) = offer.get("seller").and_then(|s| {
s.get("name").and_then(|n| n.as_str())
.or_else(|| s.as_str())
}) {
offer_parts.push(seller.to_string());
}
if let Some(price) = offer.get("price") {
let currency = offer
.get("priceCurrency")
.and_then(|c| c.as_str())
.unwrap_or("USD");
let price_str = match price {
serde_json::Value::String(s) => s.clone(),
serde_json::Value::Number(n) => n.to_string(),
_ => price.to_string(),
};
offer_parts.push(format!("{} {}", price_str, currency));
}
if let Some(avail) = offer.get("availability").and_then(|v| v.as_str()) {
let avail_clean = avail
.replace("https://schema.org/", "")
.replace("http://schema.org/", "");
offer_parts.push(avail_clean);
}
if !offer_parts.is_empty() {
if offer_list.len() > 1 {
lines.push(format!("Offer {}: {}", i + 1, offer_parts.join(" - ")));
} else {
if let Some(price_part) = offer_parts.iter().find(|p| p.contains("USD") || p.contains("EUR") || p.contains("GBP") || p.chars().next().map(|c| c.is_ascii_digit()).unwrap_or(false)) {
lines.push(format!("Price: {}", price_part));
}
if let Some(avail_part) = offer_parts.iter().find(|p| p.contains("Stock") || p.contains("Available")) {
lines.push(format!("Availability: {}", avail_part));
}
}
}
}
if show_count {
lines.push(format!("... and {} more offers", offer_list.len() - max_offers));
}
}
fn extract_nested_name(json: &serde_json::Value, field: &str) -> Option<String> {
let value = json.get(field)?;
match value {
serde_json::Value::String(s) => Some(s.clone()),
serde_json::Value::Object(_) => value.get("name").and_then(|n| n.as_str().map(String::from)),
serde_json::Value::Array(arr) => arr.first().and_then(|v| match v {
serde_json::Value::String(s) => Some(s.clone()),
serde_json::Value::Object(_) => v.get("name").and_then(|n| n.as_str().map(String::from)),
_ => None,
}),
_ => None,
}
}
pub fn extract_title(html: &str) -> Option<String> {
let document = Html::parse_document(html);
let title_selector = Selector::parse("title").ok()?;
document
.select(&title_selector)
.next()
.map(|el| el.text().collect::<Vec<_>>().join(""))
}
pub fn extract_raw_jsonld(html: &str) -> Option<String> {
let document = Html::parse_document(html);
let jsonld_selector = Selector::parse(r#"script[type="application/ld+json"]"#).ok()?;
let mut all_jsonld = Vec::new();
for script in document.select(&jsonld_selector) {
let text: String = script.text().collect();
if let Ok(json) = serde_json::from_str::<serde_json::Value>(&text) {
all_jsonld.push(json);
}
}
if all_jsonld.is_empty() {
return None;
}
if all_jsonld.len() == 1 {
serde_json::to_string_pretty(&all_jsonld[0]).ok()
} else {
serde_json::to_string_pretty(&all_jsonld).ok()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_selector() {
let html = r#"<html><body><div class="content">Hello World</div></body></html>"#;
let result = extract_selector(html, ".content").unwrap();
assert_eq!(result.trim(), "Hello World");
}
#[test]
fn test_extract_meta() {
let html = r#"
<html>
<head>
<meta name="description" content="Test description">
<meta property="og:title" content="OG Title">
</head>
<body></body>
</html>
"#;
let result = extract_meta(html, &["description".into(), "og:title".into()]).unwrap();
assert!(result.contains("Test description"));
assert!(result.contains("OG Title"));
}
#[test]
fn test_extract_title() {
let html = r#"<html><head><title>My Page Title</title></head><body></body></html>"#;
let title = extract_title(html);
assert_eq!(title, Some("My Page Title".to_string()));
}
#[test]
fn test_jsonld_product_single_offer() {
let html = r#"
<html><head>
<script type="application/ld+json">
{
"@type": "Product",
"name": "Test Product",
"offers": {
"price": "99.99",
"priceCurrency": "USD",
"availability": "https://schema.org/InStock"
}
}
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, None).unwrap();
assert!(result.contains("[Product]"));
assert!(result.contains("Test Product"));
assert!(result.contains("99.99 USD"));
assert!(result.contains("InStock"));
}
#[test]
fn test_jsonld_product_multiple_offers() {
let html = r#"
<html><head>
<script type="application/ld+json">
{
"@type": "Product",
"name": "Multi-Seller Product",
"offers": [
{"seller": {"name": "Seller A"}, "price": "50.00", "priceCurrency": "USD"},
{"seller": {"name": "Seller B"}, "price": "55.00", "priceCurrency": "USD"},
{"seller": {"name": "Seller C"}, "price": "52.00", "priceCurrency": "USD"}
]
}
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, None).unwrap();
assert!(result.contains("Offer 1:"));
assert!(result.contains("Offer 2:"));
assert!(result.contains("Offer 3:"));
assert!(result.contains("Seller A"));
assert!(result.contains("Seller B"));
assert!(result.contains("50.00 USD"));
}
#[test]
fn test_jsonld_namespaced_type() {
let html = r#"
<html><head>
<script type="application/ld+json">
{
"@type": "schema:Product",
"name": "Namespaced Product"
}
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, Some(&vec!["Product".to_string()])).unwrap();
assert!(result.contains("Namespaced Product"));
}
#[test]
fn test_jsonld_type_filter() {
let html = r#"
<html><head>
<script type="application/ld+json">
[
{"@type": "Product", "name": "Product Item"},
{"@type": "Organization", "name": "Org Item"}
]
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, Some(&vec!["Product".to_string()])).unwrap();
assert!(result.contains("Product Item"));
assert!(!result.contains("Org Item"));
}
#[test]
fn test_jsonld_graph_structure() {
let html = r#"
<html><head>
<script type="application/ld+json">
{
"@graph": [
{"@type": "Article", "headline": "Test Article"},
{"@type": "Organization", "name": "Test Org"}
]
}
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, None).unwrap();
assert!(result.contains("Test Article"));
assert!(result.contains("Test Org"));
}
#[test]
fn test_jsonld_article() {
let html = r#"
<html><head>
<script type="application/ld+json">
{
"@type": "NewsArticle",
"headline": "Breaking News",
"author": {"name": "John Doe"},
"datePublished": "2024-01-15"
}
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, None).unwrap();
assert!(result.contains("Breaking News"));
assert!(result.contains("John Doe"));
assert!(result.contains("2024-01-15"));
}
#[test]
fn test_jsonld_event() {
let html = r#"
<html><head>
<script type="application/ld+json">
{
"@type": "Event",
"name": "Concert",
"startDate": "2024-06-01",
"location": {"name": "Arena"}
}
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, None).unwrap();
assert!(result.contains("Concert"));
assert!(result.contains("2024-06-01"));
assert!(result.contains("Arena"));
}
#[test]
fn test_jsonld_no_data() {
let html = r#"<html><head></head><body></body></html>"#;
let result = extract_jsonld(html, None);
assert!(result.is_err());
}
#[test]
fn test_type_matches_filter_basic() {
assert!(type_matches_filter("Product", &["Product".to_string()]));
assert!(type_matches_filter("product", &["Product".to_string()])); assert!(!type_matches_filter("Article", &["Product".to_string()]));
}
#[test]
fn test_type_matches_filter_namespaced() {
assert!(type_matches_filter("schema:Product", &["Product".to_string()]));
assert!(type_matches_filter("http://schema.org/Product", &["Product".to_string()]));
}
#[test]
fn test_type_matches_filter_partial() {
assert!(type_matches_filter("NewsArticle", &["Article".to_string()]));
assert!(type_matches_filter("BlogPosting", &["Blog".to_string()]));
}
#[test]
fn test_offers_limit() {
let html = r#"
<html><head>
<script type="application/ld+json">
{
"@type": "Product",
"name": "Many Offers Product",
"offers": [
{"price": "10", "priceCurrency": "USD"},
{"price": "11", "priceCurrency": "USD"},
{"price": "12", "priceCurrency": "USD"},
{"price": "13", "priceCurrency": "USD"},
{"price": "14", "priceCurrency": "USD"},
{"price": "15", "priceCurrency": "USD"},
{"price": "16", "priceCurrency": "USD"}
]
}
</script>
</head><body></body></html>
"#;
let result = extract_jsonld(html, None).unwrap();
assert!(result.contains("Offer 5:"));
assert!(result.contains("... and 2 more offers"));
assert!(!result.contains("Offer 6:"));
}
#[test]
fn test_extract_raw_jsonld() {
let html = r#"
<html><head>
<script type="application/ld+json">
{"@type": "Product", "name": "Test Product", "sku": "12345"}
</script>
</head><body></body></html>
"#;
let result = extract_raw_jsonld(html).unwrap();
assert!(result.contains("\"@type\": \"Product\""));
assert!(result.contains("\"name\": \"Test Product\""));
assert!(result.contains("\"sku\": \"12345\""));
}
#[test]
fn test_extract_raw_jsonld_none() {
let html = r#"<html><head></head><body></body></html>"#;
let result = extract_raw_jsonld(html);
assert!(result.is_none());
}
}