1use serde_json::json;
34use std::borrow::Cow;
35use std::collections::HashMap;
36
37use crate::error::{HtmlError, Result, SeoErrorKind};
38use lazy_static::lazy_static;
39use regex::{Captures, Regex};
40use scraper::{Html, Selector};
41
/// Largest HTML input, measured in bytes, that the generators in this module accept.
const MAX_HTML_SIZE: usize = 1_000_000;
/// Schema.org `@type` used by the default structured-data configuration.
const DEFAULT_PAGE_TYPE: &str = "WebPage";
/// Value written to the JSON-LD `@context` key.
const SCHEMA_ORG_CONTEXT: &str = "https://schema.org";
/// Open Graph `og:type` a builder created with `MetaTagsBuilder::new` starts with.
const DEFAULT_OG_TYPE: &str = "website";
51
52lazy_static! {
54 static ref HTML_ESCAPES: Regex = Regex::new(r#"[&<>"']"#)
56 .expect("Failed to compile HTML escapes regex");
57
58 static ref META_DESC_SELECTOR: Selector = Selector::parse("meta[name='description']")
60 .expect("Failed to compile meta description selector");
61
62 static ref TITLE_SELECTOR: Selector = Selector::parse("title")
64 .expect("Failed to compile title selector");
65
66 static ref PARAGRAPH_SELECTOR: Selector = Selector::parse("p")
68 .expect("Failed to compile paragraph selector");
69}
70
/// Options controlling the JSON-LD document produced by `generate_structured_data`.
#[derive(Debug, Clone)]
pub struct StructuredDataConfig {
    /// Extra top-level key/value pairs written into the JSON-LD object after the
    /// standard keys; `None` adds nothing.
    pub additional_data: Option<HashMap<String, String>>,
    /// Primary `@type` of the page; must not be empty.
    pub page_type: String,
    /// Further types; when non-empty, `@type` becomes an array that starts with
    /// `page_type`. No entry may be empty.
    pub additional_types: Vec<String>,
}
81
82impl Default for StructuredDataConfig {
83 fn default() -> Self {
84 Self {
85 additional_data: None,
86 page_type: String::from(DEFAULT_PAGE_TYPE),
87 additional_types: Vec::new(),
88 }
89 }
90}
91
92impl StructuredDataConfig {
93 fn validate(&self) -> Result<()> {
101 validate_page_type(&self.page_type)?;
102
103 if self.additional_types.iter().any(String::is_empty) {
104 return Err(HtmlError::seo(
105 SeoErrorKind::InvalidStructuredData,
106 "Additional types cannot be empty",
107 None,
108 ));
109 }
110 Ok(())
111 }
112}
113
114#[derive(Debug, Default)]
116pub struct MetaTagsBuilder {
117 title: Option<String>,
119 description: Option<String>,
121 og_type: String,
123 additional_tags: Vec<(String, String)>,
125}
126
127impl MetaTagsBuilder {
128 #[must_use]
130 pub fn new() -> Self {
131 Self {
132 title: None,
133 description: None,
134 og_type: String::from(DEFAULT_OG_TYPE),
135 additional_tags: Vec::new(),
136 }
137 }
138
139 #[must_use]
141 pub fn with_title(mut self, title: impl Into<String>) -> Self {
142 self.title = Some(title.into());
143 self
144 }
145
146 #[must_use]
148 pub fn with_description(mut self, desc: impl Into<String>) -> Self {
149 self.description = Some(desc.into());
150 self
151 }
152
153 #[must_use]
155 pub fn add_meta_tag(
156 mut self,
157 name: impl Into<String>,
158 content: impl Into<String>,
159 ) -> Self {
160 self.additional_tags.push((name.into(), content.into()));
161 self
162 }
163
164 #[must_use]
166 pub fn add_meta_tags<I>(mut self, tags: I) -> Self
167 where
168 I: IntoIterator<Item = (String, String)>,
169 {
170 self.additional_tags.extend(tags);
171 self
172 }
173
174 pub fn build(self) -> Result<String> {
180 let title = self.title.ok_or_else(|| {
181 HtmlError::seo(
182 SeoErrorKind::MissingTitle,
183 "Meta title is required",
184 None,
185 )
186 })?;
187
188 let description = self.description.ok_or_else(|| {
189 HtmlError::seo(
190 SeoErrorKind::MissingDescription,
191 "Meta description is required",
192 None,
193 )
194 })?;
195
196 let mut meta_tags = String::with_capacity(500);
197
198 meta_tags.push_str(&format!(
200 r#"<meta name="title" content="{}">"#,
201 escape_html(&title)
202 ));
203 meta_tags.push_str(&format!(
204 r#"<meta name="description" content="{}">"#,
205 escape_html(&description)
206 ));
207 meta_tags.push_str(&format!(
208 r#"<meta property="og:type" content="{}">"#,
209 escape_html(&self.og_type)
210 ));
211
212 for (name, content) in self.additional_tags {
214 meta_tags.push_str(&format!(
215 r#"<meta name="{}" content="{}">"#,
216 escape_html(&name),
217 escape_html(&content)
218 ));
219 }
220
221 Ok(meta_tags)
222 }
223}
224
225fn validate_page_type(page_type: &str) -> Result<()> {
231 if page_type.is_empty() {
232 return Err(HtmlError::seo(
233 SeoErrorKind::InvalidStructuredData,
234 "Page type cannot be empty",
235 None,
236 ));
237 }
238 Ok(())
239}
240
241#[must_use]
272pub fn escape_html(s: &str) -> Cow<str> {
273 HTML_ESCAPES.replace_all(s, |caps: &Captures| match &caps[0] {
274 "&" => "&",
275 "<" => "<",
276 ">" => ">",
277 "\"" => """,
278 "'" => "'",
279 _ => unreachable!("Regex only matches [&<>\"']"),
280 })
281}
282
283pub fn generate_meta_tags(html: &str) -> Result<String> {
309 if html.len() > MAX_HTML_SIZE {
310 return Err(HtmlError::InputTooLarge(html.len()));
311 }
312
313 let document = Html::parse_document(html);
314 let title = extract_title(&document)?;
315 let description = extract_description(&document)?;
316
317 MetaTagsBuilder::new()
318 .with_title(title)
319 .with_description(description)
320 .build()
321}
322
323pub fn generate_structured_data(
352 html: &str,
353 config: Option<StructuredDataConfig>,
354) -> Result<String> {
355 if html.len() > MAX_HTML_SIZE {
356 return Err(HtmlError::InputTooLarge(html.len()));
357 }
358
359 let document = Html::parse_document(html);
360 let config = config.unwrap_or_default();
361 config.validate()?;
362
363 let title = extract_title(&document)?;
364 let description = extract_description(&document)?;
365
366 let mut json = if config.additional_types.is_empty() {
367 json!({
368 "@context": SCHEMA_ORG_CONTEXT,
369 "@type": config.page_type,
370 "name": title,
371 "description": description,
372 })
373 } else {
374 let mut types = vec![config.page_type];
375 types.extend(config.additional_types);
376 json!({
377 "@context": SCHEMA_ORG_CONTEXT,
378 "@type": types,
379 "name": title,
380 "description": description,
381 })
382 };
383
384 if let Some(additional_data) = config.additional_data {
386 for (key, value) in additional_data {
387 json[key] = json!(value);
388 }
389 }
390
391 Ok(format!(
392 r#"<script type="application/ld+json">
393{}
394</script>"#,
395 serde_json::to_string_pretty(&json).map_err(|e| {
396 HtmlError::InvalidStructuredData(e.to_string())
397 })?
398 ))
399}
400
401fn extract_title(document: &Html) -> Result<String> {
403 document
404 .select(&TITLE_SELECTOR)
405 .next()
406 .map(|t| t.text().collect::<String>())
407 .ok_or_else(|| {
408 HtmlError::MissingHtmlElement("title".to_string())
409 })
410}
411
412fn extract_description(document: &Html) -> Result<String> {
413 if let Some(meta) = document.select(&META_DESC_SELECTOR).next() {
415 if let Some(content) = meta.value().attr("content") {
416 return Ok(content.to_string());
417 }
418 }
419
420 document
422 .select(&PARAGRAPH_SELECTOR)
423 .next()
424 .map(|p| p.text().collect::<String>())
425 .ok_or_else(|| {
426 HtmlError::MissingHtmlElement("description".to_string())
427 })
428}
429
430#[cfg(test)]
431mod tests {
432 use super::*;
433 use test_case::test_case as case;
434
mod meta_tags_builder {
    //! Tests for `MetaTagsBuilder`.
    use super::*;

    /// Tags that share a name are all rendered; none overwrites another.
    #[test]
    fn handles_duplicate_meta_tags() {
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Duplicate Test")
            .with_description("Testing duplicates")
            .add_meta_tag("author", "John Doe")
            .add_meta_tag("author", "Jane Doe")
            .build()
            .unwrap();

        assert!(meta_tags.contains(r#"content="John Doe""#));
        assert!(meta_tags.contains(r#"content="Jane Doe""#));
    }

    /// Successive `add_meta_tags` calls accumulate instead of replacing.
    #[test]
    fn handles_multiple_add_meta_tags_calls() {
        let mut builder = MetaTagsBuilder::new()
            .with_title("Test")
            .with_description("Description");
        builder = builder.add_meta_tags(vec![(
            "key1".to_string(),
            "value1".to_string(),
        )]);
        builder = builder.add_meta_tags(vec![(
            "key2".to_string(),
            "value2".to_string(),
        )]);
        let meta_tags = builder.build().unwrap();

        assert!(meta_tags.contains(r#"content="value1""#));
        assert!(meta_tags.contains(r#"content="value2""#));
    }

    /// The title, description and one extra tag are rendered verbatim.
    #[test]
    fn builds_basic_meta_tags() {
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Test Title")
            .with_description("Test Description")
            .add_meta_tag("keywords", "test,keywords")
            .build()
            .unwrap();

        assert!(meta_tags.contains(
            r#"<meta name="title" content="Test Title">"#
        ));
        assert!(meta_tags.contains(r#"<meta name="description" content="Test Description">"#));
        assert!(meta_tags.contains(
            r#"<meta name="keywords" content="test,keywords">"#
        ));
    }

    /// A batch of tags passed in one call is rendered completely.
    #[test]
    fn handles_multiple_meta_tags() {
        let tags = vec![
            ("keywords".to_string(), "test,tags".to_string()),
            ("robots".to_string(), "index,follow".to_string()),
        ];
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Test")
            .with_description("Test")
            .add_meta_tags(tags)
            .build()
            .unwrap();

        assert!(
            meta_tags.contains(r#"keywords" content="test,tags"#)
        );
        assert!(
            meta_tags.contains(r#"robots" content="index,follow"#)
        );
    }

    /// Building without a title reports `MissingTitle`.
    #[test]
    fn fails_without_title() {
        let result = MetaTagsBuilder::new()
            .with_description("Test Description")
            .build();

        assert!(matches!(
            result,
            Err(HtmlError::Seo {
                kind: SeoErrorKind::MissingTitle,
                ..
            })
        ));
    }

    /// Building without a description reports `MissingDescription`.
    #[test]
    fn fails_without_description() {
        let result =
            MetaTagsBuilder::new().with_title("Test Title").build();

        assert!(matches!(
            result,
            Err(HtmlError::Seo {
                kind: SeoErrorKind::MissingDescription,
                ..
            })
        ));
    }

    /// Special characters in values must arrive in the output as HTML entities,
    /// never as the raw characters.
    #[test]
    fn escapes_special_characters_in_meta_tags() {
        let meta_tags = MetaTagsBuilder::new()
            .with_title("Test & Title")
            .with_description("Test < Description >")
            .build()
            .unwrap();

        assert!(meta_tags.contains(r#"content="Test &amp; Title"#));
        assert!(meta_tags
            .contains(r#"content="Test &lt; Description &gt;"#));
    }
}
553
554 mod html_escaping {
556 use super::*;
557
558 #[case("<>&\"'" => "<>&"'" ; "escapes all special characters")]
559 #[case("Normal text" => "Normal text" ; "leaves normal text unchanged")]
560 #[case("" => "" ; "handles empty string")]
561 fn escape_html_cases(input: &str) -> String {
562 escape_html(input).into_owned()
563 }
564
565 #[test]
566 fn escapes_mixed_content() {
567 let input = "Text with <tags> & \"quotes\" 'here'";
568 let expected = "Text with <tags> & "quotes" 'here'";
569 assert_eq!(escape_html(input), expected);
570 }
571
572 #[test]
573 fn handles_large_input() {
574 let large_input = "<>".repeat(100_000);
575 let escaped = escape_html(&large_input);
576 assert!(escaped.contains("<>"));
577 }
578 }
579
mod structured_data {
    //! Tests for `generate_structured_data` and `StructuredDataConfig`.
    use super::*;

    /// Returns the text from the first `{` to the last `}` of `script`, i.e. the
    /// JSON payload without the surrounding `<script>` element.
    fn extract_json_from_script(script: &str) -> String {
        let first_brace =
            script.find('{').expect("JSON should start with '{'");
        let last_brace =
            script.rfind('}').expect("JSON should end with '}'");
        String::from(&script[first_brace..=last_brace])
    }

    /// Additional types and several additional data entries are all reflected in
    /// the generated JSON.
    #[test]
    fn handles_deeply_nested_configuration() {
        let html = r"<html><head><title>Nested Test</title></head><body><p>Description</p></body></html>";
        let extras = HashMap::from([
            ("level1".to_string(), "value1".to_string()),
            ("level2".to_string(), "value2".to_string()),
        ]);

        let config = StructuredDataConfig {
            additional_data: Some(extras),
            page_type: "TestType".to_string(),
            additional_types: vec!["ExtraType".to_string()],
        };

        let script =
            generate_structured_data(html, Some(config)).unwrap();
        let parsed: serde_json::Value =
            serde_json::from_str(&extract_json_from_script(&script))
                .unwrap();

        assert_eq!(
            parsed["@type"],
            serde_json::json!(["TestType", "ExtraType"])
        );
        assert_eq!(parsed["level1"], "value1");
        assert_eq!(parsed["level2"], "value2");
    }

    /// Without a configuration the defaults are used and the title and
    /// description come from the document.
    #[test]
    fn generates_basic_structured_data() {
        let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
        let script = generate_structured_data(html, None).unwrap();

        let payload = extract_json_from_script(&script);
        let parsed: serde_json::Value =
            serde_json::from_str(&payload).unwrap();

        assert_eq!(parsed["@type"], "WebPage");
        assert_eq!(parsed["name"], "Test");
        assert_eq!(parsed["description"], "Description");
    }

    /// A non-empty `additional_types` turns `@type` into an array led by the
    /// page type.
    #[test]
    fn generates_multiple_types() {
        let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
        let mut author = HashMap::new();
        _ = author
            .insert("author".to_string(), "Test Author".to_string());
        let config = StructuredDataConfig {
            additional_data: Some(author),
            page_type: "Article".to_string(),
            additional_types: vec!["WebPage".to_string()],
        };

        let script =
            generate_structured_data(html, Some(config)).unwrap();
        let payload = extract_json_from_script(&script);
        let parsed: serde_json::Value =
            serde_json::from_str(&payload).unwrap();

        assert_eq!(
            parsed["@type"],
            serde_json::json!(["Article", "WebPage"]),
            "Expected @type to include multiple types"
        );
        assert_eq!(
            parsed["author"], "Test Author",
            "Expected author to be included"
        );
    }

    /// Empty page types and empty additional types are both rejected.
    #[test]
    fn validates_config() {
        let blank_page_type = StructuredDataConfig {
            page_type: String::new(),
            ..Default::default()
        };
        assert!(blank_page_type.validate().is_err());

        let blank_additional_type = StructuredDataConfig {
            additional_types: vec![String::new()],
            ..Default::default()
        };
        assert!(blank_additional_type.validate().is_err());
    }
}
680
mod input_validation {
    //! Tests for how the public generators treat unusable input.
    use super::*;

    /// Input one byte over the limit is rejected by `generate_meta_tags`.
    #[test]
    fn enforces_size_limit_for_meta_tags() {
        let oversized = "a".repeat(MAX_HTML_SIZE + 1);
        let outcome = generate_meta_tags(&oversized);
        assert!(matches!(outcome, Err(HtmlError::InputTooLarge(_))));
    }

    /// Input one byte over the limit is rejected by `generate_structured_data`.
    #[test]
    fn enforces_size_limit_for_structured_data() {
        let oversized = "a".repeat(MAX_HTML_SIZE + 1);
        let outcome = generate_structured_data(&oversized, None);
        assert!(matches!(outcome, Err(HtmlError::InputTooLarge(_))));
    }

    /// A document without `<title>` is reported as missing the `title` element.
    #[test]
    fn handles_missing_title() {
        let html =
            r"<html><body><p>No title here</p></body></html>";
        let outcome = generate_meta_tags(html);
        assert!(matches!(
            outcome,
            Err(HtmlError::MissingHtmlElement(ref e)) if e == "title"
        ));
    }

    /// A document with a title but neither a meta description nor a paragraph is
    /// reported as missing the `description` element.
    #[test]
    fn handles_missing_description() {
        let html =
            r"<html><head><title>Title only</title></head></html>";
        let outcome = generate_meta_tags(html);
        assert!(matches!(
            outcome,
            Err(HtmlError::MissingHtmlElement(ref e)) if e == "description"
        ));
    }

    /// Generation fails for this input.
    ///
    /// NOTE(review): the document has no `<title>`, so the failure most likely
    /// comes from title extraction rather than from the `<invalid>` key; confirm
    /// whether key validation is actually intended here.
    #[test]
    fn invalid_additional_data_keys() {
        let extras = HashMap::from([(
            "<invalid>".to_string(),
            "value".to_string(),
        )]);
        let config = StructuredDataConfig {
            additional_data: Some(extras),
            ..Default::default()
        };
        let outcome =
            generate_structured_data("<html></html>", Some(config));
        assert!(outcome.is_err());
    }
}
737}