html_generator/
seo.rs

1// Copyright © 2025 HTML Generator. All rights reserved.
2// SPDX-License-Identifier: Apache-2.0 OR MIT
3
4//! Search Engine Optimization (SEO) functionality for HTML processing.
5//!
6//! This module provides tools for improving the SEO of web pages through automated
7//! meta tag generation and structured data implementation. It includes features for:
8//! - Meta tag generation for improved search engine visibility
9//! - Structured data (JSON-LD) generation for rich search results
10//! - HTML content analysis for SEO optimization
11//! - Safe HTML entity escaping
12//!
13//! # Examples
14//!
15//! ```rust
16//! use html_generator::seo::{MetaTagsBuilder, generate_structured_data};
17//!
18//! # fn main() -> Result<(), Box<dyn std::error::Error>> {
19//! let html = r#"<html><head><title>My Page</title></head><body><p>Content</p></body></html>"#;
20//!
21//! // Generate meta tags
22//! let meta_tags = MetaTagsBuilder::new()
23//!     .with_title("My Page")
24//!     .with_description("Page content")
25//!     .build()?;
26//!
27//! // Generate structured data
28//! let structured_data = generate_structured_data(html, None)?;
29//! # Ok(())
30//! # }
31//! ```
32
33use serde_json::json;
34use std::borrow::Cow;
35use std::collections::HashMap;
36
37use crate::error::{HtmlError, Result, SeoErrorKind};
38use lazy_static::lazy_static;
39use regex::{Captures, Regex};
40use scraper::{Html, Selector};
41
// Constants
/// Maximum allowed size for HTML input, in bytes (1 MB = 1,000,000 bytes).
///
/// The public generators compare `html.len()` (a byte count) against this
/// value and reject larger inputs before any parsing takes place.
const MAX_HTML_SIZE: usize = 1_000_000;
/// Default page type for structured data; used as the JSON-LD `@type` when
/// the caller does not supply a configuration.
const DEFAULT_PAGE_TYPE: &str = "WebPage";
/// Schema.org context URL, emitted as the JSON-LD `@context` value.
const SCHEMA_ORG_CONTEXT: &str = "https://schema.org";
/// Default OpenGraph type, emitted as the content of the `og:type` meta tag.
const DEFAULT_OG_TYPE: &str = "website";
51
// Shared matchers. `lazy_static!` builds each value once, on first access at
// runtime (not at compile time), and reuses it for every later call.
lazy_static! {
    /// Regular expression matching the HTML special characters `&`, `<`, `>`,
    /// `"` and `'`.
    static ref HTML_ESCAPES: Regex = Regex::new(r#"[&<>"']"#)
        .expect("Failed to compile HTML escapes regex");

    /// CSS selector matching `<meta name="description">` elements
    static ref META_DESC_SELECTOR: Selector = Selector::parse("meta[name='description']")
        .expect("Failed to compile meta description selector");

    /// CSS selector matching `<title>` elements
    static ref TITLE_SELECTOR: Selector = Selector::parse("title")
        .expect("Failed to compile title selector");

    /// CSS selector matching `<p>` (paragraph) elements
    static ref PARAGRAPH_SELECTOR: Selector = Selector::parse("p")
        .expect("Failed to compile paragraph selector");
}
70
/// Configuration options for structured data generation.
///
/// Passed (optionally) to [`generate_structured_data`]; when omitted, the
/// `Default` implementation is used instead.
#[derive(Debug, Clone)]
pub struct StructuredDataConfig {
    /// Additional key-value pairs to include in the structured data.
    ///
    /// Each entry is written as a top-level string property of the JSON-LD
    /// object after the standard properties have been set, so an entry whose
    /// key equals an existing property name replaces that property.
    pub additional_data: Option<HashMap<String, String>>,
    /// The type of webpage (e.g., "WebPage", "Article", "Product").
    ///
    /// Must not be empty. Emitted as `@type` on its own, or as the first
    /// element of the `@type` array when `additional_types` is non-empty.
    pub page_type: String,
    /// Additional schema.org types to include.
    ///
    /// No element may be empty. When non-empty, `@type` becomes an array of
    /// `page_type` followed by these values in order.
    pub additional_types: Vec<String>,
}
81
82impl Default for StructuredDataConfig {
83    fn default() -> Self {
84        Self {
85            additional_data: None,
86            page_type: String::from(DEFAULT_PAGE_TYPE),
87            additional_types: Vec::new(),
88        }
89    }
90}
91
92impl StructuredDataConfig {
93    /// Validates the configuration.
94    ///
95    /// # Errors
96    ///
97    /// Returns an error if:
98    /// - The page type is empty
99    /// - Any additional type is empty
100    fn validate(&self) -> Result<()> {
101        validate_page_type(&self.page_type)?;
102
103        if self.additional_types.iter().any(String::is_empty) {
104            return Err(HtmlError::seo(
105                SeoErrorKind::InvalidStructuredData,
106                "Additional types cannot be empty",
107                None,
108            ));
109        }
110        Ok(())
111    }
112}
113
114/// Builder for constructing meta tags.
115#[derive(Debug, Default)]
116pub struct MetaTagsBuilder {
117    /// Title for the meta tags
118    title: Option<String>,
119    /// Description for the meta tags
120    description: Option<String>,
121    /// OpenGraph type
122    og_type: String,
123    /// Additional meta tags
124    additional_tags: Vec<(String, String)>,
125}
126
127impl MetaTagsBuilder {
128    /// Creates a new `MetaTagsBuilder` with default values.
129    #[must_use]
130    pub fn new() -> Self {
131        Self {
132            title: None,
133            description: None,
134            og_type: String::from(DEFAULT_OG_TYPE),
135            additional_tags: Vec::new(),
136        }
137    }
138
139    /// Sets the title for the meta tags.
140    #[must_use]
141    pub fn with_title(mut self, title: impl Into<String>) -> Self {
142        self.title = Some(title.into());
143        self
144    }
145
146    /// Sets the description for the meta tags.
147    #[must_use]
148    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
149        self.description = Some(desc.into());
150        self
151    }
152
153    /// Adds an additional meta tag.
154    #[must_use]
155    pub fn add_meta_tag(
156        mut self,
157        name: impl Into<String>,
158        content: impl Into<String>,
159    ) -> Self {
160        self.additional_tags.push((name.into(), content.into()));
161        self
162    }
163
164    /// Adds multiple meta tags at once.
165    #[must_use]
166    pub fn add_meta_tags<I>(mut self, tags: I) -> Self
167    where
168        I: IntoIterator<Item = (String, String)>,
169    {
170        self.additional_tags.extend(tags);
171        self
172    }
173
174    /// Builds the meta tags string.
175    ///
176    /// # Errors
177    ///
178    /// Returns an error if required fields (title or description) are missing.
179    pub fn build(self) -> Result<String> {
180        let title = self.title.ok_or_else(|| {
181            HtmlError::seo(
182                SeoErrorKind::MissingTitle,
183                "Meta title is required",
184                None,
185            )
186        })?;
187
188        let description = self.description.ok_or_else(|| {
189            HtmlError::seo(
190                SeoErrorKind::MissingDescription,
191                "Meta description is required",
192                None,
193            )
194        })?;
195
196        let mut meta_tags = String::with_capacity(500);
197
198        // Add required meta tags
199        meta_tags.push_str(&format!(
200            r#"<meta name="title" content="{}">"#,
201            escape_html(&title)
202        ));
203        meta_tags.push_str(&format!(
204            r#"<meta name="description" content="{}">"#,
205            escape_html(&description)
206        ));
207        meta_tags.push_str(&format!(
208            r#"<meta property="og:type" content="{}">"#,
209            escape_html(&self.og_type)
210        ));
211
212        // Add additional meta tags
213        for (name, content) in self.additional_tags {
214            meta_tags.push_str(&format!(
215                r#"<meta name="{}" content="{}">"#,
216                escape_html(&name),
217                escape_html(&content)
218            ));
219        }
220
221        Ok(meta_tags)
222    }
223}
224
225/// Validates that a page type is not empty.
226///
227/// # Errors
228///
229/// Returns an error if the page type is empty.
230fn validate_page_type(page_type: &str) -> Result<()> {
231    if page_type.is_empty() {
232        return Err(HtmlError::seo(
233            SeoErrorKind::InvalidStructuredData,
234            "Page type cannot be empty",
235            None,
236        ));
237    }
238    Ok(())
239}
240
241/// Escapes HTML special characters in a string.
242///
243/// This function replaces special characters with their HTML entity equivalents:
244/// - `&` becomes `&amp;`
245/// - `<` becomes `&lt;`
246/// - `>` becomes `&gt;`
247/// - `"` becomes `&quot;`
248/// - `'` becomes `&#x27;`
249///
250/// # Arguments
251///
252/// * `s` - The string to escape
253///
254/// # Returns
255///
256/// Returns a `Cow<str>` containing either the original string if no escaping was
257/// needed, or a new string with escaped characters.
258///
259/// # Examples
260///
261/// ```
262/// use html_generator::seo::escape_html;
263///
264/// let input = r#"<script>alert("Hello & goodbye")</script>"#;
265/// let escaped = escape_html(input);
266/// assert_eq!(
267///     escaped,
268///     r#"&lt;script&gt;alert(&quot;Hello &amp; goodbye&quot;)&lt;/script&gt;"#
269/// );
270/// ```
271#[must_use]
272pub fn escape_html(s: &str) -> Cow<str> {
273    HTML_ESCAPES.replace_all(s, |caps: &Captures| match &caps[0] {
274        "&" => "&amp;",
275        "<" => "&lt;",
276        ">" => "&gt;",
277        "\"" => "&quot;",
278        "'" => "&#x27;",
279        _ => unreachable!("Regex only matches [&<>\"']"),
280    })
281}
282
283/// Generates meta tags for SEO purposes.
284///
285/// # Arguments
286///
287/// * `html` - The HTML content to analyze
288///
289/// # Returns
290///
291/// Returns a `Result` containing the generated meta tags as a string.
292///
293/// # Errors
294///
295/// Returns an error if:
296/// - The HTML input is too large (> 1MB)
297/// - Required elements (title, description) are missing
298///
299/// # Examples
300///
301/// ```
302/// use html_generator::seo::generate_meta_tags;
303///
304/// let html = r#"<html><head><title>Test</title></head><body><p>Content</p></body></html>"#;
305/// let meta_tags = generate_meta_tags(html)?;
306/// # Ok::<(), html_generator::error::HtmlError>(())
307/// ```
308pub fn generate_meta_tags(html: &str) -> Result<String> {
309    if html.len() > MAX_HTML_SIZE {
310        return Err(HtmlError::InputTooLarge(html.len()));
311    }
312
313    let document = Html::parse_document(html);
314    let title = extract_title(&document)?;
315    let description = extract_description(&document)?;
316
317    MetaTagsBuilder::new()
318        .with_title(title)
319        .with_description(description)
320        .build()
321}
322
323/// Generates structured data (JSON-LD) for SEO purposes.
324///
325/// # Arguments
326///
327/// * `html` - The HTML content to analyze
328/// * `config` - Optional configuration for structured data generation
329///
330/// # Returns
331///
332/// Returns a `Result` containing the generated JSON-LD script as a string.
333///
334/// # Errors
335///
336/// Returns an error if:
337/// - The HTML input is too large (> 1MB)
338/// - Required elements are missing
339/// - JSON serialization fails
340/// - Configuration validation fails
341///
342/// # Examples
343///
344/// ```
345/// use html_generator::seo::generate_structured_data;
346///
347/// let html = r#"<html><head><title>Test</title></head><body><p>Content</p></body></html>"#;
348/// let structured_data = generate_structured_data(html, None)?;
349/// # Ok::<(), html_generator::error::HtmlError>(())
350/// ```
351pub fn generate_structured_data(
352    html: &str,
353    config: Option<StructuredDataConfig>,
354) -> Result<String> {
355    if html.len() > MAX_HTML_SIZE {
356        return Err(HtmlError::InputTooLarge(html.len()));
357    }
358
359    let document = Html::parse_document(html);
360    let config = config.unwrap_or_default();
361    config.validate()?;
362
363    let title = extract_title(&document)?;
364    let description = extract_description(&document)?;
365
366    let mut json = if config.additional_types.is_empty() {
367        json!({
368            "@context": SCHEMA_ORG_CONTEXT,
369            "@type": config.page_type,
370            "name": title,
371            "description": description,
372        })
373    } else {
374        let mut types = vec![config.page_type];
375        types.extend(config.additional_types);
376        json!({
377            "@context": SCHEMA_ORG_CONTEXT,
378            "@type": types,
379            "name": title,
380            "description": description,
381        })
382    };
383
384    // Add any additional data
385    if let Some(additional_data) = config.additional_data {
386        for (key, value) in additional_data {
387            json[key] = json!(value);
388        }
389    }
390
391    Ok(format!(
392        r#"<script type="application/ld+json">
393{}
394</script>"#,
395        serde_json::to_string_pretty(&json).map_err(|e| {
396            HtmlError::InvalidStructuredData(e.to_string())
397        })?
398    ))
399}
400
401// Private helper functions
402fn extract_title(document: &Html) -> Result<String> {
403    document
404        .select(&TITLE_SELECTOR)
405        .next()
406        .map(|t| t.text().collect::<String>())
407        .ok_or_else(|| {
408            HtmlError::MissingHtmlElement("title".to_string())
409        })
410}
411
412fn extract_description(document: &Html) -> Result<String> {
413    // Try meta description first
414    if let Some(meta) = document.select(&META_DESC_SELECTOR).next() {
415        if let Some(content) = meta.value().attr("content") {
416            return Ok(content.to_string());
417        }
418    }
419
420    // Fall back to first paragraph
421    document
422        .select(&PARAGRAPH_SELECTOR)
423        .next()
424        .map(|p| p.text().collect::<String>())
425        .ok_or_else(|| {
426            HtmlError::MissingHtmlElement("description".to_string())
427        })
428}
429
430#[cfg(test)]
431mod tests {
432    use super::*;
433    use test_case::test_case as case;
434
435    /// Tests for MetaTagsBuilder functionality
436    mod meta_tags_builder {
437        use super::*;
438
439        #[test]
440        fn handles_duplicate_meta_tags() {
441            let meta_tags = MetaTagsBuilder::new()
442                .with_title("Duplicate Test")
443                .with_description("Testing duplicates")
444                .add_meta_tag("author", "John Doe")
445                .add_meta_tag("author", "Jane Doe")
446                .build()
447                .unwrap();
448
449            assert!(meta_tags.contains(r#"content="John Doe""#));
450            assert!(meta_tags.contains(r#"content="Jane Doe""#));
451        }
452
453        #[test]
454        fn handles_multiple_add_meta_tags_calls() {
455            let mut builder = MetaTagsBuilder::new()
456                .with_title("Test")
457                .with_description("Description");
458            builder = builder.add_meta_tags(vec![(
459                "key1".to_string(),
460                "value1".to_string(),
461            )]);
462            builder = builder.add_meta_tags(vec![(
463                "key2".to_string(),
464                "value2".to_string(),
465            )]);
466            let meta_tags = builder.build().unwrap();
467
468            assert!(meta_tags.contains(r#"content="value1""#));
469            assert!(meta_tags.contains(r#"content="value2""#));
470        }
471
472        #[test]
473        fn builds_basic_meta_tags() {
474            let meta_tags = MetaTagsBuilder::new()
475                .with_title("Test Title")
476                .with_description("Test Description")
477                .add_meta_tag("keywords", "test,keywords")
478                .build()
479                .unwrap();
480
481            assert!(meta_tags.contains(
482                r#"<meta name="title" content="Test Title">"#
483            ));
484            assert!(meta_tags.contains(r#"<meta name="description" content="Test Description">"#));
485            assert!(meta_tags.contains(
486                r#"<meta name="keywords" content="test,keywords">"#
487            ));
488        }
489
490        #[test]
491        fn handles_multiple_meta_tags() {
492            let tags = vec![
493                ("keywords".to_string(), "test,tags".to_string()),
494                ("robots".to_string(), "index,follow".to_string()),
495            ];
496            let meta_tags = MetaTagsBuilder::new()
497                .with_title("Test")
498                .with_description("Test")
499                .add_meta_tags(tags)
500                .build()
501                .unwrap();
502
503            assert!(
504                meta_tags.contains(r#"keywords" content="test,tags"#)
505            );
506            assert!(
507                meta_tags.contains(r#"robots" content="index,follow"#)
508            );
509        }
510
511        #[test]
512        fn fails_without_title() {
513            let result = MetaTagsBuilder::new()
514                .with_description("Test Description")
515                .build();
516
517            assert!(matches!(
518                result,
519                Err(HtmlError::Seo {
520                    kind: SeoErrorKind::MissingTitle,
521                    ..
522                })
523            ));
524        }
525
526        #[test]
527        fn fails_without_description() {
528            let result =
529                MetaTagsBuilder::new().with_title("Test Title").build();
530
531            assert!(matches!(
532                result,
533                Err(HtmlError::Seo {
534                    kind: SeoErrorKind::MissingDescription,
535                    ..
536                })
537            ));
538        }
539
540        #[test]
541        fn escapes_special_characters_in_meta_tags() {
542            let meta_tags = MetaTagsBuilder::new()
543                .with_title("Test & Title")
544                .with_description("Test < Description >")
545                .build()
546                .unwrap();
547
548            assert!(meta_tags.contains(r#"content="Test &amp; Title"#));
549            assert!(meta_tags
550                .contains(r#"content="Test &lt; Description &gt;"#));
551        }
552    }
553
554    /// Tests for HTML escaping functionality
555    mod html_escaping {
556        use super::*;
557
558        #[case("<>&\"'" => "&lt;&gt;&amp;&quot;&#x27;" ; "escapes all special characters")]
559        #[case("Normal text" => "Normal text" ; "leaves normal text unchanged")]
560        #[case("" => "" ; "handles empty string")]
561        fn escape_html_cases(input: &str) -> String {
562            escape_html(input).into_owned()
563        }
564
565        #[test]
566        fn escapes_mixed_content() {
567            let input = "Text with <tags> & \"quotes\" 'here'";
568            let expected = "Text with &lt;tags&gt; &amp; &quot;quotes&quot; &#x27;here&#x27;";
569            assert_eq!(escape_html(input), expected);
570        }
571
572        #[test]
573        fn handles_large_input() {
574            let large_input = "<>".repeat(100_000);
575            let escaped = escape_html(&large_input);
576            assert!(escaped.contains("&lt;&gt;"));
577        }
578    }
579
580    /// Tests for structured data functionality
581    mod structured_data {
582        use super::*;
583
584        #[test]
585        fn handles_deeply_nested_configuration() {
586            let html = r"<html><head><title>Nested Test</title></head><body><p>Description</p></body></html>";
587            let mut additional_data = HashMap::new();
588            _ = additional_data
589                .insert("level1".to_string(), "value1".to_string());
590            _ = additional_data
591                .insert("level2".to_string(), "value2".to_string());
592
593            let config = StructuredDataConfig {
594                page_type: "TestType".to_string(),
595                additional_types: vec!["ExtraType".to_string()],
596                additional_data: Some(additional_data),
597            };
598
599            let result =
600                generate_structured_data(html, Some(config)).unwrap();
601            let json_content = extract_json_from_script(&result);
602            let parsed: serde_json::Value =
603                serde_json::from_str(&json_content).unwrap();
604
605            assert_eq!(
606                parsed["@type"],
607                serde_json::json!(["TestType", "ExtraType"])
608            );
609            assert_eq!(parsed["level1"], "value1");
610            assert_eq!(parsed["level2"], "value2");
611        }
612
613        #[test]
614        fn generates_basic_structured_data() {
615            let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
616            let result = generate_structured_data(html, None).unwrap();
617
618            let json_content = extract_json_from_script(&result);
619            let parsed: serde_json::Value =
620                serde_json::from_str(&json_content).unwrap();
621
622            assert_eq!(parsed["@type"], "WebPage");
623            assert_eq!(parsed["name"], "Test");
624            assert_eq!(parsed["description"], "Description");
625        }
626
627        #[test]
628        fn generates_multiple_types() {
629            let html = r"<html><head><title>Test</title></head><body><p>Description</p></body></html>";
630            let config = StructuredDataConfig {
631                page_type: "Article".to_string(),
632                additional_types: vec!["WebPage".to_string()],
633                additional_data: Some(HashMap::from([(
634                    "author".to_string(),
635                    "Test Author".to_string(),
636                )])),
637            };
638
639            let result =
640                generate_structured_data(html, Some(config)).unwrap();
641            let json_content = extract_json_from_script(&result);
642            let parsed: serde_json::Value =
643                serde_json::from_str(&json_content).unwrap();
644
645            assert_eq!(
646                parsed["@type"],
647                serde_json::json!(["Article", "WebPage"]),
648                "Expected @type to include multiple types"
649            );
650            assert_eq!(
651                parsed["author"], "Test Author",
652                "Expected author to be included"
653            );
654        }
655
656        #[test]
657        fn validates_config() {
658            let empty_type = StructuredDataConfig {
659                page_type: "".to_string(),
660                ..Default::default()
661            };
662            assert!(empty_type.validate().is_err());
663
664            let empty_additional = StructuredDataConfig {
665                additional_types: vec!["".to_string()],
666                ..Default::default()
667            };
668            assert!(empty_additional.validate().is_err());
669        }
670
671        /// Helper function to extract JSON content from script tags
672        fn extract_json_from_script(script: &str) -> String {
673            let json_start =
674                script.find('{').expect("JSON should start with '{'");
675            let json_end =
676                script.rfind('}').expect("JSON should end with '}'");
677            script[json_start..=json_end].to_string()
678        }
679    }
680
681    /// Tests for input validation and limits
682    mod input_validation {
683        use super::*;
684
685        #[test]
686        fn enforces_size_limit_for_meta_tags() {
687            let large_html = "a".repeat(MAX_HTML_SIZE + 1);
688            assert!(matches!(
689                generate_meta_tags(&large_html),
690                Err(HtmlError::InputTooLarge(_))
691            ));
692        }
693
694        #[test]
695        fn enforces_size_limit_for_structured_data() {
696            let large_html = "a".repeat(MAX_HTML_SIZE + 1);
697            assert!(matches!(
698                generate_structured_data(&large_html, None),
699                Err(HtmlError::InputTooLarge(_))
700            ));
701        }
702
703        #[test]
704        fn handles_missing_title() {
705            let html =
706                r"<html><body><p>No title here</p></body></html>";
707            assert!(matches!(
708                generate_meta_tags(html),
709                Err(HtmlError::MissingHtmlElement(ref e)) if e == "title"
710            ));
711        }
712
713        #[test]
714        fn handles_missing_description() {
715            let html =
716                r"<html><head><title>Title only</title></head></html>";
717            assert!(matches!(
718                generate_meta_tags(html),
719                Err(HtmlError::MissingHtmlElement(ref e)) if e == "description"
720            ));
721        }
722
723        #[test]
724        fn invalid_additional_data_keys() {
725            let mut additional_data = HashMap::new();
726            _ = additional_data
727                .insert("<invalid>".to_string(), "value".to_string());
728            let config = StructuredDataConfig {
729                additional_data: Some(additional_data),
730                ..Default::default()
731            };
732            let result =
733                generate_structured_data("<html></html>", Some(config));
734            assert!(result.is_err());
735        }
736    }
737}