use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use url::Url;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LinkTag {
pub rel: String,
pub href: Option<String>,
pub link_type: Option<String>,
pub title: Option<String>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DiscoveredFeed {
pub url: String,
pub title: Option<String>,
pub feed_type: String,
pub discovery_method: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PageFacts {
pub url: String,
pub final_url: String,
#[serde(skip_serializing)]
pub html: String,
pub headers: HashMap<String, String>,
pub script_srcs: Vec<String>,
pub stylesheet_hrefs: Vec<String>,
pub link_tags: Vec<LinkTag>,
pub meta_tags: HashMap<String, String>,
pub json_ld: Option<serde_json::Value>,
pub json_ld_types: Vec<String>,
pub discovered_feeds: Vec<DiscoveredFeed>,
pub js_globals: Vec<String>,
#[serde(skip_serializing)]
pub js_rendered_html: Option<String>,
pub js_rendered_text: Option<String>,
pub is_spa: bool,
pub has_bot_protection: bool,
pub content_length: usize,
pub title: Option<String>,
pub meta_generator: Option<String>,
}
impl Default for PageFacts {
fn default() -> Self {
Self {
url: String::new(),
final_url: String::new(),
html: String::new(),
headers: HashMap::new(),
script_srcs: Vec::new(),
stylesheet_hrefs: Vec::new(),
link_tags: Vec::new(),
meta_tags: HashMap::new(),
json_ld: None,
json_ld_types: Vec::new(),
discovered_feeds: Vec::new(),
js_globals: Vec::new(),
js_rendered_html: None,
js_rendered_text: None,
is_spa: false,
has_bot_protection: false,
content_length: 0,
title: None,
meta_generator: None,
}
}
}
impl PageFacts {
pub fn new(url: &str, final_url: &str, html: &str) -> Self {
let mut facts = Self {
url: url.to_string(),
final_url: final_url.to_string(),
html: html.to_string(),
..Default::default()
};
facts.extract_features();
facts
}
pub fn with_js_content(
url: &str,
final_url: &str,
http_html: &str,
js_html: Option<&str>,
js_text: Option<&str>,
headers: HashMap<String, String>,
) -> Self {
let mut facts = Self {
url: url.to_string(),
final_url: final_url.to_string(),
html: http_html.to_string(),
headers,
js_rendered_html: js_html.map(String::from),
js_rendered_text: js_text.map(String::from),
..Default::default()
};
facts.extract_features();
facts.detect_spa_indicators();
facts
}
fn extract_features(&mut self) {
let document = Html::parse_document(&self.html);
self.extract_scripts(&document);
self.extract_stylesheets(&document);
self.extract_link_tags(&document);
self.extract_meta_tags(&document);
self.extract_json_ld(&document);
self.discover_feeds(&document);
self.extract_title(&document);
self.detect_bot_protection();
self.calculate_content_length(&document);
}
fn extract_scripts(&mut self, document: &Html) {
if let Ok(selector) = Selector::parse("script[src]") {
for element in document.select(&selector) {
if let Some(src) = element.value().attr("src") {
self.script_srcs.push(src.to_string());
}
}
}
}
fn extract_stylesheets(&mut self, document: &Html) {
if let Ok(selector) = Selector::parse("link[rel='stylesheet'][href]") {
for element in document.select(&selector) {
if let Some(href) = element.value().attr("href") {
self.stylesheet_hrefs.push(href.to_string());
}
}
}
}
fn extract_link_tags(&mut self, document: &Html) {
if let Ok(selector) = Selector::parse("link") {
for element in document.select(&selector) {
let rel = element.value().attr("rel").unwrap_or_default().to_string();
let href = element.value().attr("href").map(String::from);
let link_type = element.value().attr("type").map(String::from);
let title = element.value().attr("title").map(String::from);
if !rel.is_empty() {
self.link_tags.push(LinkTag {
rel,
href,
link_type,
title,
});
}
}
}
}
fn extract_meta_tags(&mut self, document: &Html) {
if let Ok(selector) = Selector::parse("meta[name], meta[property]") {
for element in document.select(&selector) {
let key = element
.value()
.attr("name")
.or_else(|| element.value().attr("property"))
.map(String::from);
let content = element.value().attr("content").map(String::from);
if let (Some(k), Some(c)) = (key, content) {
if k == "generator" {
self.meta_generator = Some(c.clone());
}
self.meta_tags.insert(k, c);
}
}
}
}
fn extract_json_ld(&mut self, document: &Html) {
if let Ok(selector) = Selector::parse(r#"script[type="application/ld+json"]"#) {
let mut json_ld_items: Vec<serde_json::Value> = Vec::new();
for element in document.select(&selector) {
let text: String = element.text().collect();
if let Ok(json) = serde_json::from_str::<serde_json::Value>(&text) {
self.extract_json_ld_types(&json);
json_ld_items.push(json);
}
}
if !json_ld_items.is_empty() {
self.json_ld = if json_ld_items.len() == 1 {
Some(json_ld_items.remove(0))
} else {
Some(serde_json::Value::Array(json_ld_items))
};
}
}
}
fn extract_json_ld_types(&mut self, json: &serde_json::Value) {
match json {
serde_json::Value::Object(map) => {
if let Some(t) = map.get("@type") {
match t {
serde_json::Value::String(s) => {
if !self.json_ld_types.contains(s) {
self.json_ld_types.push(s.clone());
}
}
serde_json::Value::Array(arr) => {
for item in arr {
if let serde_json::Value::String(s) = item {
if !self.json_ld_types.contains(s) {
self.json_ld_types.push(s.clone());
}
}
}
}
_ => {}
}
}
for (key, value) in map {
if key != "@context" {
self.extract_json_ld_types(value);
}
}
}
serde_json::Value::Array(arr) => {
for item in arr {
self.extract_json_ld_types(item);
}
}
_ => {}
}
}
fn discover_feeds(&mut self, document: &Html) {
if let Ok(selector) = Selector::parse(
r#"link[rel="alternate"][type="application/rss+xml"],
link[rel="alternate"][type="application/atom+xml"],
link[rel="alternate"][type="application/feed+json"]"#,
) {
for element in document.select(&selector) {
if let Some(href) = element.value().attr("href") {
let resolved_url = self.resolve_url(href);
let feed_type = element
.value()
.attr("type")
.map(|t| {
if t.contains("atom") {
"atom"
} else if t.contains("json") {
"json"
} else {
"rss"
}
})
.unwrap_or("rss");
let title = element.value().attr("title").map(String::from);
self.discovered_feeds.push(DiscoveredFeed {
url: resolved_url,
title,
feed_type: feed_type.to_string(),
discovery_method: "link-tag".to_string(),
});
}
}
}
}
fn resolve_url(&self, relative: &str) -> String {
if relative.starts_with("http://") || relative.starts_with("https://") {
return relative.to_string();
}
if let Ok(base) = Url::parse(&self.final_url) {
if let Ok(resolved) = base.join(relative) {
return resolved.to_string();
}
}
relative.to_string()
}
fn extract_title(&mut self, document: &Html) {
if let Ok(selector) = Selector::parse("title") {
if let Some(element) = document.select(&selector).next() {
let title: String = element.text().collect();
self.title = Some(title.trim().to_string());
}
}
}
fn detect_bot_protection(&mut self) {
let lower_html = self.html.to_lowercase();
self.has_bot_protection = lower_html.contains("cloudflare")
|| lower_html.contains("cf-ray")
|| lower_html.contains("captcha")
|| lower_html.contains("please enable javascript")
|| lower_html.contains("browser check")
|| lower_html.contains("checking your browser")
|| lower_html.contains("ddos-guard")
|| lower_html.contains("recaptcha");
}
fn calculate_content_length(&mut self, document: &Html) {
if let Ok(body_selector) = Selector::parse("body") {
if let Some(body) = document.select(&body_selector).next() {
let text: String = body.text().collect::<Vec<_>>().join(" ");
let collapsed: String = text.split_whitespace().collect::<Vec<_>>().join(" ");
self.content_length = collapsed.len();
}
}
}
fn detect_spa_indicators(&mut self) {
let lower_html = self.html.to_lowercase();
let spa_indicators = [
"__NEXT_DATA__",
"__NUXT__",
"__GATSBY",
"window.__REDUX",
"window.__APOLLO",
"ng-app",
"ng-view",
"data-reactroot",
"data-react-helmet",
"_app-root",
];
for indicator in spa_indicators {
if self.html.contains(indicator) || lower_html.contains(&indicator.to_lowercase()) {
self.js_globals.push(indicator.to_string());
}
}
let framework_scripts = [
"/_next/",
"/_nuxt/",
"/gatsby-",
"vue.min.js",
"react.production",
"angular.min.js",
];
for script in &self.script_srcs {
for pattern in framework_scripts {
if script.contains(pattern) && !self.js_globals.contains(&pattern.to_string()) {
self.js_globals.push(pattern.to_string());
}
}
}
let http_len = self.content_length;
let js_len = self
.js_rendered_text
.as_ref()
.map(|t| t.len())
.unwrap_or(0);
self.is_spa = !self.js_globals.is_empty()
|| (js_len > http_len * 2 && js_len > 500)
|| (http_len < 300 && js_len > 1000);
}
pub fn html_contains(&self, pattern: &str) -> bool {
self.html.to_lowercase().contains(&pattern.to_lowercase())
}
pub fn has_script_from(&self, host_pattern: &str) -> bool {
self.script_srcs
.iter()
.any(|src| src.contains(host_pattern))
}
pub fn has_stylesheet_from(&self, pattern: &str) -> bool {
self.stylesheet_hrefs.iter().any(|href| href.contains(pattern))
}
pub fn host(&self) -> Option<String> {
Url::parse(&self.final_url)
.ok()
.and_then(|u| u.host_str().map(String::from))
}
pub fn has_json_ld_type(&self, type_name: &str) -> bool {
self.json_ld_types
.iter()
.any(|t| t.eq_ignore_ascii_case(type_name))
}
pub fn summary(&self) -> String {
let mut parts = Vec::new();
if let Some(host) = self.host() {
parts.push(format!("Host: {}", host));
}
if !self.json_ld_types.is_empty() {
parts.push(format!("JSON-LD: {}", self.json_ld_types.join(", ")));
}
if !self.discovered_feeds.is_empty() {
parts.push(format!("{} feed(s)", self.discovered_feeds.len()));
}
if let Some(gen) = &self.meta_generator {
parts.push(format!("Generator: {}", gen));
}
if self.is_spa {
parts.push("SPA detected".to_string());
}
if self.has_bot_protection {
parts.push("Bot protection".to_string());
}
parts.push(format!("Content: {} chars", self.content_length));
parts.join(" | ")
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_basic_extraction() {
let html = r#"
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
<meta name="generator" content="WordPress 6.0">
<link rel="alternate" type="application/rss+xml" href="/feed.xml" title="RSS">
<link rel="stylesheet" href="/style.css">
<script src="https://example.com/app.js"></script>
</head>
<body>
<p>Hello world</p>
</body>
</html>
"#;
let facts = PageFacts::new("https://example.com", "https://example.com", html);
assert_eq!(facts.title, Some("Test Page".to_string()));
assert_eq!(
facts.meta_generator,
Some("WordPress 6.0".to_string())
);
assert_eq!(facts.discovered_feeds.len(), 1);
assert_eq!(facts.discovered_feeds[0].feed_type, "rss");
assert_eq!(facts.script_srcs.len(), 1);
assert!(!facts.is_spa);
}
#[test]
fn test_json_ld_extraction() {
let html = r#"
<!DOCTYPE html>
<html>
<head>
<script type="application/ld+json">
{
"@context": "https://schema.org",
"@type": "Product",
"name": "Test Product",
"offers": {
"@type": "Offer",
"price": "99.99"
}
}
</script>
</head>
<body></body>
</html>
"#;
let facts = PageFacts::new("https://example.com", "https://example.com", html);
assert!(facts.json_ld.is_some());
assert!(facts.has_json_ld_type("Product"));
assert!(facts.has_json_ld_type("Offer"));
}
#[test]
fn test_spa_detection() {
let html = r#"
<!DOCTYPE html>
<html>
<head>
<script src="/_next/static/chunks/app.js"></script>
</head>
<body>
<div id="__next"></div>
<script id="__NEXT_DATA__" type="application/json">{"props":{}}</script>
</body>
</html>
"#;
let facts = PageFacts::with_js_content(
"https://example.com",
"https://example.com",
html,
None,
None,
HashMap::new(),
);
assert!(facts.is_spa);
assert!(!facts.js_globals.is_empty());
}
#[test]
fn test_bot_protection_detection() {
let html = r#"
<!DOCTYPE html>
<html>
<head></head>
<body>
<div id="cf-wrapper">
Checking your browser before accessing the site.
</div>
</body>
</html>
"#;
let facts = PageFacts::new("https://example.com", "https://example.com", html);
assert!(facts.has_bot_protection);
}
#[test]
fn test_shopify_detection() {
let html = r#"
<!DOCTYPE html>
<html>
<head>
<script src="https://cdn.shopify.com/s/files/1/0123/shop.js"></script>
</head>
<body>
<script>window.Shopify = {};</script>
</body>
</html>
"#;
let facts = PageFacts::new("https://example.myshopify.com", "https://example.myshopify.com", html);
assert!(facts.html_contains("cdn.shopify.com"));
assert!(facts.has_script_from("cdn.shopify.com"));
}
}