use serde_json::json;
use std::borrow::Cow;
use std::collections::HashMap;
use crate::error::{HtmlError, Result, SeoErrorKind};
use lazy_static::lazy_static;
use regex::{Captures, Regex};
use scraper::{Html, Selector};
/// Largest HTML input, measured in bytes via `str::len`, that the generator
/// functions in this file accept before returning `HtmlError::InputTooLarge`.
const MAX_HTML_SIZE: usize = 1_000_000;
/// `@type` that `StructuredDataConfig::default` assigns to `page_type`.
const DEFAULT_PAGE_TYPE: &str = "WebPage";
/// Value written to the `@context` key of the generated JSON-LD object.
const SCHEMA_ORG_CONTEXT: &str = "https://schema.org";
/// Open Graph type that `MetaTagsBuilder::new` starts out with.
const DEFAULT_OG_TYPE: &str = "website";
// Lazily-initialised regex and CSS selectors shared by the helpers below.
// Each pattern is a fixed literal, so the `expect` calls can only fire if the
// literal itself is malformed.
lazy_static! {
/// Matches any single character that `escape_html` rewrites: `&`, `<`, `>`, `"` or `'`.
static ref HTML_ESCAPES: Regex = Regex::new(r#"[&<>"']"#)
.expect("Failed to compile HTML escapes regex");
/// Selects `<meta name='description'>` elements.
static ref META_DESC_SELECTOR: Selector = Selector::parse("meta[name='description']")
.expect("Failed to compile meta description selector");
/// Selects `<title>` elements.
static ref TITLE_SELECTOR: Selector = Selector::parse("title")
.expect("Failed to compile title selector");
/// Selects `<p>` elements; used as the fallback source for a description.
static ref PARAGRAPH_SELECTOR: Selector = Selector::parse("p")
.expect("Failed to compile paragraph selector");
}
/// Options controlling the JSON-LD object produced by `generate_structured_data`.
#[derive(Debug, Clone)]
pub struct StructuredDataConfig {
/// Extra key/value pairs assigned onto the top level of the JSON object
/// after the standard keys; a key that repeats a standard one replaces it.
pub additional_data: Option<HashMap<String, String>>,
/// Primary `@type` of the page. Must not be empty.
pub page_type: String,
/// Further types. When non-empty, `@type` becomes an array holding
/// `page_type` followed by these entries; none of them may be empty.
pub additional_types: Vec<String>,
}
impl Default for StructuredDataConfig {
    /// Builds a configuration whose page type is `DEFAULT_PAGE_TYPE`, with no
    /// additional types and no additional data.
    fn default() -> Self {
        let page_type = DEFAULT_PAGE_TYPE.to_owned();
        Self {
            page_type,
            additional_types: vec![],
            additional_data: None,
        }
    }
}
impl StructuredDataConfig {
    /// Checks that the configuration can be turned into structured data.
    ///
    /// # Errors
    ///
    /// Returns the error from `validate_page_type` when `page_type` is
    /// rejected, or an `InvalidStructuredData` SEO error when any entry of
    /// `additional_types` is an empty string.
    fn validate(&self) -> Result<()> {
        validate_page_type(&self.page_type)?;
        for extra_type in &self.additional_types {
            if extra_type.is_empty() {
                return Err(HtmlError::seo(
                    SeoErrorKind::InvalidStructuredData,
                    "Additional types cannot be empty",
                    None,
                ));
            }
        }
        Ok(())
    }
}
/// Builder that assembles a string of HTML `<meta>` tags.
///
/// A title and a description are mandatory; [`MetaTagsBuilder::build`] fails
/// without them. Any number of extra `name`/`content` pairs may be appended
/// and are emitted in insertion order, duplicates included.
#[derive(Debug)]
pub struct MetaTagsBuilder {
    /// Content of the `title` meta tag; required by `build`.
    title: Option<String>,
    /// Content of the `description` meta tag; required by `build`.
    description: Option<String>,
    /// Content of the `og:type` property tag.
    og_type: String,
    /// Extra `(name, content)` pairs, emitted after the standard tags.
    additional_tags: Vec<(String, String)>,
}

impl Default for MetaTagsBuilder {
    /// Equivalent to [`MetaTagsBuilder::new`].
    ///
    /// Implemented by hand rather than derived: a derived `Default` would
    /// leave `og_type` as an empty string, so a defaulted builder would emit
    /// `og:type` with empty content while `new()` emits `DEFAULT_OG_TYPE`.
    fn default() -> Self {
        Self::new()
    }
}

impl MetaTagsBuilder {
    /// Creates an empty builder whose Open Graph type is `DEFAULT_OG_TYPE`.
    #[must_use]
    pub fn new() -> Self {
        Self {
            title: None,
            description: None,
            og_type: String::from(DEFAULT_OG_TYPE),
            additional_tags: Vec::new(),
        }
    }

    /// Sets the title, replacing any previously set value.
    #[must_use]
    pub fn with_title(mut self, title: impl Into<String>) -> Self {
        self.title = Some(title.into());
        self
    }

    /// Sets the description, replacing any previously set value.
    #[must_use]
    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
        self.description = Some(desc.into());
        self
    }

    /// Appends one extra meta tag with the given `name` and `content`.
    #[must_use]
    pub fn add_meta_tag(
        mut self,
        name: impl Into<String>,
        content: impl Into<String>,
    ) -> Self {
        self.additional_tags.push((name.into(), content.into()));
        self
    }

    /// Appends every `(name, content)` pair yielded by `tags`.
    #[must_use]
    pub fn add_meta_tags<I>(mut self, tags: I) -> Self
    where
        I: IntoIterator<Item = (String, String)>,
    {
        self.additional_tags.extend(tags);
        self
    }

    /// Renders the collected values as concatenated `<meta>` tags.
    ///
    /// The output contains, in order: the `title` tag, the `description` tag,
    /// the `og:type` property tag, then every additional tag. All attribute
    /// values, and the names of additional tags, pass through `escape_html`.
    ///
    /// # Errors
    ///
    /// Returns a `MissingTitle` SEO error when no title was set, otherwise a
    /// `MissingDescription` SEO error when no description was set.
    pub fn build(self) -> Result<String> {
        let title = self.title.ok_or_else(|| {
            HtmlError::seo(
                SeoErrorKind::MissingTitle,
                "Meta title is required",
                None,
            )
        })?;
        let description = self.description.ok_or_else(|| {
            HtmlError::seo(
                SeoErrorKind::MissingDescription,
                "Meta description is required",
                None,
            )
        })?;

        let mut meta_tags = String::with_capacity(500);
        meta_tags.push_str(&format!(
            r#"<meta name="title" content="{}">"#,
            escape_html(&title)
        ));
        meta_tags.push_str(&format!(
            r#"<meta name="description" content="{}">"#,
            escape_html(&description)
        ));
        meta_tags.push_str(&format!(
            r#"<meta property="og:type" content="{}">"#,
            escape_html(&self.og_type)
        ));
        for (name, content) in self.additional_tags {
            meta_tags.push_str(&format!(
                r#"<meta name="{}" content="{}">"#,
                escape_html(&name),
                escape_html(&content)
            ));
        }
        Ok(meta_tags)
    }
}
/// Accepts any non-empty page type.
///
/// # Errors
///
/// Returns an `InvalidStructuredData` SEO error when `page_type` is the
/// empty string.
fn validate_page_type(page_type: &str) -> Result<()> {
    if !page_type.is_empty() {
        return Ok(());
    }
    Err(HtmlError::seo(
        SeoErrorKind::InvalidStructuredData,
        "Page type cannot be empty",
        None,
    ))
}
#[must_use]
pub fn escape_html(s: &str) -> Cow<str> {
HTML_ESCAPES.replace_all(s, |caps: &Captures| match &caps[0] {
"&" => "&",
"<" => "<",
">" => ">",
"\"" => """,
"'" => "'",
_ => unreachable!("Regex only matches [&<>\"']"),
})
}
/// Derives `<meta>` tags from an HTML document.
///
/// The title comes from `extract_title` and the description from
/// `extract_description`; both are handed to a fresh `MetaTagsBuilder`.
///
/// # Errors
///
/// Returns `HtmlError::InputTooLarge` when `html` is longer than
/// `MAX_HTML_SIZE` bytes, and otherwise propagates any error from title
/// extraction, description extraction, or the builder, in that order.
pub fn generate_meta_tags(html: &str) -> Result<String> {
    let byte_len = html.len();
    if byte_len > MAX_HTML_SIZE {
        return Err(HtmlError::InputTooLarge(byte_len));
    }

    let parsed = Html::parse_document(html);
    let page_title = extract_title(&parsed)?;
    let page_description = extract_description(&parsed)?;

    let builder = MetaTagsBuilder::new()
        .with_title(page_title)
        .with_description(page_description);
    builder.build()
}
/// Produces a `<script type="application/ld+json">` element describing an
/// HTML document.
///
/// The JSON object carries `@context`, `@type`, `name` (the document title)
/// and `description`. `@type` is a single string when the configuration has
/// no additional types, otherwise an array of the page type followed by the
/// additional types. Entries of `additional_data` are assigned onto the
/// object afterwards, so a key that repeats a standard one replaces it.
///
/// When `config` is `None`, `StructuredDataConfig::default()` is used.
///
/// # Errors
///
/// In order of checking: `HtmlError::InputTooLarge` when `html` exceeds
/// `MAX_HTML_SIZE` bytes; the configuration's validation error; the errors
/// of title and description extraction; and
/// `HtmlError::InvalidStructuredData` when serialization fails.
pub fn generate_structured_data(
    html: &str,
    config: Option<StructuredDataConfig>,
) -> Result<String> {
    if html.len() > MAX_HTML_SIZE {
        return Err(HtmlError::InputTooLarge(html.len()));
    }
    let document = Html::parse_document(html);
    let config = config.unwrap_or_default();
    config.validate()?;
    let title = extract_title(&document)?;
    let description = extract_description(&document)?;

    let mut json = if config.additional_types.is_empty() {
        json!({
            "@context": SCHEMA_ORG_CONTEXT,
            "@type": config.page_type,
            "name": title,
            "description": description,
        })
    } else {
        let mut types = vec![config.page_type];
        types.extend(config.additional_types);
        json!({
            "@context": SCHEMA_ORG_CONTEXT,
            "@type": types,
            "name": title,
            "description": description,
        })
    };
    if let Some(additional_data) = config.additional_data {
        for (key, value) in additional_data {
            json[key] = json!(value);
        }
    }

    // The serializer leaves `<` as-is, so text such as `</script>` or `<!--`
    // taken from the document would end or corrupt the surrounding script
    // element. In JSON `<` can only occur inside strings, where `\u003c` is an
    // equivalent escape, so the parsed value is unchanged.
    let serialized = serde_json::to_string_pretty(&json)
        .map_err(|e| HtmlError::InvalidStructuredData(e.to_string()))?
        .replace('<', "\\u003c");

    Ok(format!(
        "<script type=\"application/ld+json\">\n{}\n</script>",
        serialized
    ))
}
/// Returns the concatenated text of the first `<title>` element.
///
/// # Errors
///
/// Returns `HtmlError::MissingHtmlElement("title")` when the document has
/// no `<title>` element.
fn extract_title(document: &Html) -> Result<String> {
    match document.select(&TITLE_SELECTOR).next() {
        Some(title_element) => Ok(title_element.text().collect::<String>()),
        None => Err(HtmlError::MissingHtmlElement("title".to_string())),
    }
}
/// Finds a description for the document.
///
/// The `content` attribute of the first `<meta name='description'>` element
/// wins. When that element or its attribute is absent, the concatenated text
/// of the first `<p>` element is used instead.
///
/// # Errors
///
/// Returns `HtmlError::MissingHtmlElement("description")` when neither
/// source is available.
fn extract_description(document: &Html) -> Result<String> {
    let meta_content = document
        .select(&META_DESC_SELECTOR)
        .next()
        .and_then(|meta| meta.value().attr("content"));
    if let Some(content) = meta_content {
        return Ok(content.to_string());
    }

    match document.select(&PARAGRAPH_SELECTOR).next() {
        Some(paragraph) => Ok(paragraph.text().collect::<String>()),
        None => {
            Err(HtmlError::MissingHtmlElement("description".to_string()))
        }
    }
}
#[cfg(test)]
mod tests {
use super::*;
use test_case::test_case as case;
/// Tests for `MetaTagsBuilder`.
mod meta_tags_builder {
    use super::*;

    /// Tags sharing a name are all emitted; nothing is de-duplicated.
    #[test]
    fn handles_duplicate_meta_tags() {
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Duplicate Test")
            .with_description("Testing duplicates")
            .add_meta_tag("author", "John Doe")
            .add_meta_tag("author", "Jane Doe")
            .build()
            .unwrap();
        assert!(meta_tags.contains(r#"content="John Doe""#));
        assert!(meta_tags.contains(r#"content="Jane Doe""#));
    }

    /// Successive `add_meta_tags` calls accumulate rather than replace.
    #[test]
    fn handles_multiple_add_meta_tags_calls() {
        let mut builder = MetaTagsBuilder::new()
            .with_title("Test")
            .with_description("Description");
        builder = builder.add_meta_tags(vec![(
            "key1".to_string(),
            "value1".to_string(),
        )]);
        builder = builder.add_meta_tags(vec![(
            "key2".to_string(),
            "value2".to_string(),
        )]);
        let meta_tags = builder.build().unwrap();
        assert!(meta_tags.contains(r#"content="value1""#));
        assert!(meta_tags.contains(r#"content="value2""#));
    }

    /// Title, description and one extra tag are rendered verbatim.
    #[test]
    fn builds_basic_meta_tags() {
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Test Title")
            .with_description("Test Description")
            .add_meta_tag("keywords", "test,keywords")
            .build()
            .unwrap();
        assert!(meta_tags.contains(
            r#"<meta name="title" content="Test Title">"#
        ));
        assert!(meta_tags.contains(
            r#"<meta name="description" content="Test Description">"#
        ));
        assert!(meta_tags.contains(
            r#"<meta name="keywords" content="test,keywords">"#
        ));
    }

    /// A batch of tags passed to `add_meta_tags` is rendered in full.
    #[test]
    fn handles_multiple_meta_tags() {
        let tags = vec![
            ("keywords".to_string(), "test,tags".to_string()),
            ("robots".to_string(), "index,follow".to_string()),
        ];
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Test")
            .with_description("Test")
            .add_meta_tags(tags)
            .build()
            .unwrap();
        assert!(
            meta_tags.contains(r#"keywords" content="test,tags"#)
        );
        assert!(
            meta_tags.contains(r#"robots" content="index,follow"#)
        );
    }

    /// `build` reports `MissingTitle` when only a description was set.
    #[test]
    fn fails_without_title() {
        let result = MetaTagsBuilder::new()
            .with_description("Test Description")
            .build();
        assert!(matches!(
            result,
            Err(HtmlError::Seo {
                kind: SeoErrorKind::MissingTitle,
                ..
            })
        ));
    }

    /// `build` reports `MissingDescription` when only a title was set.
    #[test]
    fn fails_without_description() {
        let result =
            MetaTagsBuilder::new().with_title("Test Title").build();
        assert!(matches!(
            result,
            Err(HtmlError::Seo {
                kind: SeoErrorKind::MissingDescription,
                ..
            })
        ));
    }

    /// Attribute values must come out entity-escaped, never raw.
    #[test]
    fn escapes_special_characters_in_meta_tags() {
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Test & Title")
            .with_description("Test < Description >")
            .build()
            .unwrap();
        assert!(meta_tags.contains(r#"content="Test &amp; Title"#));
        assert!(meta_tags
            .contains(r#"content="Test &lt; Description &gt;"#));
    }
}
mod html_escaping {
use super::*;
#[case("<>&\"'" => "<>&"'" ; "escapes all special characters")]
#[case("Normal text" => "Normal text" ; "leaves normal text unchanged")]
#[case("" => "" ; "handles empty string")]
fn escape_html_cases(input: &str) -> String {
escape_html(input).into_owned()
}
#[test]
fn escapes_mixed_content() {
let input = "Text with <tags> & \"quotes\" 'here'";
let expected = "Text with <tags> & "quotes" 'here'";
assert_eq!(escape_html(input), expected);
}
#[test]
fn handles_large_input() {
let large_input = "<>".repeat(100_000);
let escaped = escape_html(&large_input);
assert!(escaped.contains("<>"));
}
}
/// Tests for `generate_structured_data` and `StructuredDataConfig`.
mod structured_data {
    use super::*;

    /// Several additional types plus several extra keys all reach the output.
    #[test]
    fn handles_deeply_nested_configuration() {
        let page = r"<html><head><title>Nested Test</title></head><body><p>Description</p></body></html>";
        let mut extras = HashMap::new();
        for (key, value) in [("level1", "value1"), ("level2", "value2")] {
            _ = extras.insert(key.to_string(), value.to_string());
        }
        let config = StructuredDataConfig {
            additional_data: Some(extras),
            additional_types: vec!["ExtraType".to_string()],
            page_type: "TestType".to_string(),
        };

        let script =
            generate_structured_data(page, Some(config)).unwrap();
        let parsed: serde_json::Value =
            serde_json::from_str(&extract_json_from_script(&script))
                .unwrap();

        assert_eq!(
            parsed["@type"],
            serde_json::json!(["TestType", "ExtraType"])
        );
        assert_eq!(parsed["level1"], "value1");
        assert_eq!(parsed["level2"], "value2");
    }

    /// Without a config, the defaults and the document's own text are used.
    #[test]
    fn generates_basic_structured_data() {
        let page = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";

        let script = generate_structured_data(page, None).unwrap();
        let parsed: serde_json::Value =
            serde_json::from_str(&extract_json_from_script(&script))
                .unwrap();

        assert_eq!(parsed["@type"], "WebPage");
        assert_eq!(parsed["name"], "Test");
        assert_eq!(parsed["description"], "Description");
    }

    /// An additional type turns `@type` into an array; extra data is kept.
    #[test]
    fn generates_multiple_types() {
        let page = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
        let mut extras = HashMap::new();
        _ = extras
            .insert("author".to_string(), "Test Author".to_string());
        let config = StructuredDataConfig {
            additional_data: Some(extras),
            additional_types: vec!["WebPage".to_string()],
            page_type: "Article".to_string(),
        };

        let script =
            generate_structured_data(page, Some(config)).unwrap();
        let parsed: serde_json::Value =
            serde_json::from_str(&extract_json_from_script(&script))
                .unwrap();

        assert_eq!(
            parsed["@type"],
            serde_json::json!(["Article", "WebPage"]),
            "Expected @type to include multiple types"
        );
        assert_eq!(
            parsed["author"], "Test Author",
            "Expected author to be included"
        );
    }

    /// Empty page types and empty additional types are both rejected.
    #[test]
    fn validates_config() {
        let blank_page_type = StructuredDataConfig {
            page_type: String::new(),
            ..Default::default()
        };
        assert!(blank_page_type.validate().is_err());

        let blank_additional = StructuredDataConfig {
            additional_types: vec![String::new()],
            ..Default::default()
        };
        assert!(blank_additional.validate().is_err());
    }

    /// Returns the text from the first `{` through the last `}` of `script`.
    fn extract_json_from_script(script: &str) -> String {
        let first_brace =
            script.find('{').expect("JSON should start with '{'");
        let last_brace =
            script.rfind('}').expect("JSON should end with '}'");
        String::from(&script[first_brace..=last_brace])
    }
}
/// Tests for how the generator functions react to unusable input.
mod input_validation {
    use super::*;

    /// Input one byte over the limit is rejected by `generate_meta_tags`.
    #[test]
    fn enforces_size_limit_for_meta_tags() {
        let oversized = "a".repeat(MAX_HTML_SIZE + 1);
        let outcome = generate_meta_tags(&oversized);
        assert!(matches!(outcome, Err(HtmlError::InputTooLarge(_))));
    }

    /// Input one byte over the limit is rejected by `generate_structured_data`.
    #[test]
    fn enforces_size_limit_for_structured_data() {
        let oversized = "a".repeat(MAX_HTML_SIZE + 1);
        let outcome = generate_structured_data(&oversized, None);
        assert!(matches!(outcome, Err(HtmlError::InputTooLarge(_))));
    }

    /// A document without `<title>` yields `MissingHtmlElement("title")`.
    #[test]
    fn handles_missing_title() {
        let page = r"<html><body><p>No title here</p></body></html>";
        let outcome = generate_meta_tags(page);
        assert!(matches!(
            outcome,
            Err(HtmlError::MissingHtmlElement(ref e)) if e == "title"
        ));
    }

    /// A document with a title but neither a description meta tag nor a
    /// paragraph yields `MissingHtmlElement("description")`.
    #[test]
    fn handles_missing_description() {
        let page =
            r"<html><head><title>Title only</title></head></html>";
        let outcome = generate_meta_tags(page);
        assert!(matches!(
            outcome,
            Err(HtmlError::MissingHtmlElement(ref e)) if e == "description"
        ));
    }

    /// Generation fails for this input. NOTE(review): the document has no
    /// `<title>`, so the failure most likely comes from title extraction
    /// rather than from the `<invalid>` key itself — confirm the intent.
    #[test]
    fn invalid_additional_data_keys() {
        let mut extras = HashMap::new();
        _ = extras.insert("<invalid>".to_string(), "value".to_string());
        let config = StructuredDataConfig {
            additional_data: Some(extras),
            ..Default::default()
        };
        let outcome =
            generate_structured_data("<html></html>", Some(config));
        assert!(outcome.is_err());
    }
}
}