use scraper::{Html, Selector};
use serde_json::Value;
/// Stateless extractor for JSON-LD structured data embedded in HTML pages.
///
/// The type carries no configuration, so `Default` is derived instead of
/// being written by hand, and it is cheap to copy around.
#[derive(Debug, Clone, Copy, Default)]
pub struct JsonLdExtractor;
impl JsonLdExtractor {
pub fn new() -> Self {
Self
}
/// Parses `html` and returns every JSON-LD payload found in it.
///
/// Each `<script type="application/ld+json">` element contributes one
/// [`Value`], in document order. Scripts whose contents do not parse as JSON
/// are skipped rather than reported, so a single broken block does not hide
/// the valid ones.
pub fn extract(&self, html: &str) -> Vec<Value> {
    let document = Html::parse_document(html);
    // The selector is a constant literal, so a parse failure would be a bug
    // in this file rather than a runtime condition.
    let selector = Selector::parse(r#"script[type="application/ld+json"]"#)
        .expect("the JSON-LD script selector literal is valid CSS");

    document
        .select(&selector)
        .filter_map(|script| {
            // Concatenate the element's text nodes directly into one String
            // instead of collecting a Vec and joining it with "".
            let payload: String = script.text().collect();
            serde_json::from_str::<Value>(&payload).ok()
        })
        .collect()
}
pub fn extract_by_type(&self, html: &str, schema_type: &str) -> Vec<Value> {
self.extract(html)
.into_iter()
.filter(|v| self.matches_type(v, schema_type))
.collect()
}
/// Reports whether `value` is, or contains, a node of `schema_type`.
///
/// When `value` has an `@type` entry, only that entry decides the result: it
/// may be a single string or an array of strings, and any other shape counts
/// as no match. Only when `@type` is absent is an `@graph` array searched,
/// applying the same rule to each member.
fn matches_type(&self, value: &Value, schema_type: &str) -> bool {
    if let Some(declared) = value.get("@type") {
        return match declared {
            Value::String(name) => name.as_str() == schema_type,
            Value::Array(names) => names
                .iter()
                .any(|name| name.as_str() == Some(schema_type)),
            _ => false,
        };
    }

    match value.get("@graph") {
        Some(Value::Array(members)) => members
            .iter()
            .any(|member| self.matches_type(member, schema_type)),
        _ => false,
    }
}
pub fn extract_faqs(&self, html: &str) -> Vec<FaqItem> {
let json_lds = self.extract_by_type(html, "FAQPage");
let mut faqs = Vec::new();
for json_ld in json_lds {
if let Some(main_entity) = json_ld.get("mainEntity") {
if let Value::Array(questions) = main_entity {
for q in questions {
if let (Some(question), Some(answer)) = (
q.get("name").and_then(|v| v.as_str()),
q.get("acceptedAnswer")
.and_then(|a| a.get("text"))
.and_then(|v| v.as_str()),
) {
faqs.push(FaqItem {
question: question.to_string(),
answer: answer.to_string(),
});
}
}
}
}
}
faqs
}
/// Builds a [`HowToItem`] for every `HowTo` node found in `html`.
///
/// `name` and `description` are copied when they are strings. Steps are read
/// from the `step` array; each entry may be
///
/// * a bare string, used as the step text, or
/// * an object, whose `name` is used, falling back to `text` when `name` is
///   missing or not a string.
///
/// Entries that yield no text are skipped. `position` is the entry's 1-based
/// index in the `step` array, counted before skipping, so it can have gaps.
/// A step's `image` may be a URL string or an object with a `url` field.
pub fn extract_howtos(&self, html: &str) -> Vec<HowToItem> {
    let nodes = self.extract_by_type(html, "HowTo");

    nodes
        .iter()
        .map(|howto| {
            let text_of =
                |key: &str| howto.get(key).and_then(Value::as_str).map(String::from);

            let steps = match howto.get("step") {
                Some(Value::Array(entries)) => entries
                    .iter()
                    .enumerate()
                    .filter_map(|(index, entry)| {
                        let text = match entry {
                            Value::String(plain) => Some(plain.as_str()),
                            // Resolve each key to a string before falling
                            // back, so a non-string `name` does not block
                            // a usable `text`.
                            _ => entry
                                .get("name")
                                .and_then(Value::as_str)
                                .or_else(|| entry.get("text").and_then(Value::as_str)),
                        }?;

                        let image = entry.get("image").and_then(|image| match image {
                            Value::String(url) => Some(url.clone()),
                            Value::Object(fields) => fields
                                .get("url")
                                .and_then(Value::as_str)
                                .map(String::from),
                            _ => None,
                        });

                        Some(HowToStep {
                            position: index + 1,
                            text: text.to_owned(),
                            image,
                        })
                    })
                    .collect(),
                _ => Vec::new(),
            };

            HowToItem {
                name: text_of("name"),
                description: text_of("description"),
                steps,
            }
        })
        .collect()
}
/// Builds an [`ArticleSchema`] for every `Article` node found in `html`.
///
/// String properties (`headline`, `datePublished`, `dateModified`,
/// `description`) are copied when present and left as `None` otherwise. The
/// author is resolved through `extract_person_name`.
///
/// `image` may be a URL string, an object with a `url` field, or an array of
/// either; for an array the first element is used.
pub fn extract_articles(&self, html: &str) -> Vec<ArticleSchema> {
    let nodes = self.extract_by_type(html, "Article");

    nodes
        .iter()
        .map(|article| {
            let text_of =
                |key: &str| article.get(key).and_then(Value::as_str).map(String::from);

            let image = article.get("image").and_then(|image| {
                // Reduce the array form to its first element so string and
                // object entries are handled by the same match below.
                let candidate = match image {
                    Value::Array(list) => list.first()?,
                    other => other,
                };
                match candidate {
                    Value::String(url) => Some(url.clone()),
                    Value::Object(fields) => fields
                        .get("url")
                        .and_then(Value::as_str)
                        .map(String::from),
                    _ => None,
                }
            });

            ArticleSchema {
                headline: text_of("headline"),
                author: self.extract_person_name(article, "author"),
                date_published: text_of("datePublished"),
                date_modified: text_of("dateModified"),
                description: text_of("description"),
                image,
            }
        })
        .collect()
}
/// Resolves the display name stored under `field` of `json_ld`.
///
/// The property may be a plain string, an object with a string `name`, or an
/// array of either; for an array only the first element is considered.
/// Returns `None` when the property is missing or has any other shape.
fn extract_person_name(&self, json_ld: &Value, field: &str) -> Option<String> {
    /// Name of one person entry: the string itself, or an object's `name`.
    fn name_of(person: &Value) -> Option<String> {
        match person {
            Value::String(name) => Some(name.clone()),
            Value::Object(fields) => fields
                .get("name")
                .and_then(Value::as_str)
                .map(String::from),
            _ => None,
        }
    }

    match json_ld.get(field)? {
        Value::Array(people) => people.first().and_then(name_of),
        single => name_of(single),
    }
}
}
/// One question/answer pair taken from a `FAQPage` JSON-LD node.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FaqItem {
    /// The question text.
    pub question: String,
    /// The text of the accepted answer.
    pub answer: String,
}
/// A `HowTo` JSON-LD node reduced to its name, description and steps.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HowToItem {
    /// The how-to's name, when one was provided as a string.
    pub name: Option<String>,
    /// The how-to's description, when one was provided as a string.
    pub description: Option<String>,
    /// The steps that could be read, in source order.
    pub steps: Vec<HowToStep>,
}

/// A single step of a [`HowToItem`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct HowToStep {
    /// 1-based position of the step within the source step list.
    pub position: usize,
    /// The instruction text of the step.
    pub text: String,
    /// Image URL attached to the step, if any.
    pub image: Option<String>,
}
/// The subset of an `Article` JSON-LD node exposed by the extractor.
///
/// Every field is optional: a property that is absent from the source, or
/// not in a shape the extractor understands, is `None`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ArticleSchema {
    /// Value of the `headline` property.
    pub headline: Option<String>,
    /// Name of the article's author.
    pub author: Option<String>,
    /// Value of the `datePublished` property, kept as the raw string.
    pub date_published: Option<String>,
    /// Value of the `dateModified` property, kept as the raw string.
    pub date_modified: Option<String>,
    /// Value of the `description` property.
    pub description: Option<String>,
    /// URL of the article's image.
    pub image: Option<String>,
}