blocks 0.1.0

A high-performance Rust library for block-based content editing with JSON, Markdown, and HTML support
Documentation
//! Security module for content sanitization
//!
//! This module provides comprehensive sanitization functions to prevent
//! XSS (Cross-Site Scripting) attacks and other security vulnerabilities.
use crate::error::{BlocksError, Result};
use regex::Regex;
use std::collections::HashSet;

/// HTML tag names that a default-configured sanitizer starts out allowing.
///
/// Entries are lowercase; the grouping comments are purely for readers.
#[rustfmt::skip]
const ALLOWED_TAGS: &[&str] = &[
    // Paragraphs, line breaks and inline formatting
    "p", "br", "strong", "em", "u", "s", "code", "pre",
    // Headings
    "h1", "h2", "h3", "h4", "h5", "h6",
    // Lists and quotations
    "ul", "ol", "li", "blockquote",
    // Links and images
    "a", "img",
    // Tables
    "table", "tr", "td", "th", "thead", "tbody",
    // Generic and sectioning containers
    "span", "div", "section", "article", "header", "footer", "nav",
];

/// Attribute names that a default-configured sanitizer starts out allowing.
///
/// Entries are lowercase; the grouping comments are purely for readers.
#[rustfmt::skip]
const ALLOWED_ATTRIBUTES: &[&str] = &[
    // Link and media targets
    "href", "src",
    // Descriptive text
    "alt", "title",
    // Styling and identification hooks
    "class", "id", "data-block-id", "data-type",
    // Media presentation
    "width", "height", "loading",
];

/// Substrings treated as indicators of a potential XSS payload.
///
/// The list is walked in order both when stripping keywords from HTML and
/// when rejecting URL prefixes, so the ordering below is kept stable.
#[rustfmt::skip]
const DANGEROUS_KEYWORDS: &[&str] = &[
    // Script-capable URL schemes
    "javascript:", "data:", "vbscript:",
    // Resource and pointer event handlers
    "onerror", "onload", "onclick", "onmouseover",
    // Focus and form event handlers
    "onfocus", "onblur", "onchange", "onsubmit",
    // Keyboard event handlers
    "onkeydown", "onkeyup",
];

/// Content sanitizer for XSS prevention.
///
/// Carries a tag allow-list, an attribute allow-list and a strict-mode flag.
/// `Debug` and `Clone` are derived so the sanitizer can be logged and copied
/// like any other piece of configuration.
#[derive(Debug, Clone)]
pub struct ContentSanitizer {
    /// Tag names currently allowed.
    allowed_tags: HashSet<String>,
    /// Attribute names currently allowed.
    allowed_attributes: HashSet<String>,
    /// When `true`, sanitizing strips all markup and keeps only text.
    strict_mode: bool,
}

impl ContentSanitizer {
    /// Creates a new content sanitizer with default settings
    pub fn new() -> Self {
        Self {
            allowed_tags: ALLOWED_TAGS.iter().map(|s| s.to_string()).collect(),
            allowed_attributes: ALLOWED_ATTRIBUTES.iter().map(|s| s.to_string()).collect(),
            strict_mode: false,
        }
    }

    /// Builds a sanitizer with strict mode switched on.
    ///
    /// Both allow-lists start empty; in strict mode sanitizing removes all
    /// HTML and keeps only plain text.
    pub fn strict() -> Self {
        let nothing_allowed: HashSet<String> = HashSet::new();

        Self {
            strict_mode: true,
            allowed_attributes: nothing_allowed.clone(),
            allowed_tags: nothing_allowed,
        }
    }

    /// Sanitizes HTML content to prevent XSS attacks
    ///
    /// # Arguments
    ///
    /// * `content` - The HTML content to sanitize
    ///
    /// # Returns
    ///
    /// `Result<String>` - Sanitized HTML content
    ///
    /// # Example
    ///
    /// ```rust
    /// use blocks::sanitizer::ContentSanitizer;
    ///
    /// let sanitizer = ContentSanitizer::new();
    /// let dangerous = r#"<p>Hello</p><script>alert('xss')</script>"#;
    /// let safe = sanitizer.sanitize(dangerous).unwrap();
    /// assert!(!safe.contains("script"));
    /// ```
    pub fn sanitize(&self, content: &str) -> Result<String> {
        if self.strict_mode {
            return Ok(self.strip_html(content));
        }

        let mut result = content.to_string();

        // Remove dangerous keywords
        for keyword in DANGEROUS_KEYWORDS {
            result = self.remove_dangerous_pattern(keyword, &result)?;
        }

        // Remove script tags and their content
        result = self.remove_script_tags(&result);

        // Remove iframe tags (potential security risk)
        result = self.remove_tags("iframe", &result);

        // Remove style tags
        result = self.remove_tags("style", &result);

        // Remove event handlers
        result = self.remove_event_handlers(&result);

        // Clean attributes
        result = self.clean_attributes(&result);

        Ok(result)
    }

    /// Escapes plain text so it can be embedded in HTML.
    ///
    /// The input is passed through `html_escape::encode_text`; markup is not
    /// removed, it is returned in escaped form.
    pub fn sanitize_text(&self, content: &str) -> Result<String> {
        Ok(html_escape::encode_text(content).into_owned())
    }

    /// Removes dangerous patterns from content
    fn remove_dangerous_pattern(&self, pattern: &str, content: &str) -> Result<String> {
        let regex = Regex::new(&format!(r"(?i){}", regex::escape(pattern))).map_err(|e| {
            BlocksError::CssError {
                reason: format!("Regex error: {}", e),
            }
        })?;

        Ok(regex.replace_all(content, "").to_string())
    }

    /// Removes `<script>` elements together with their content.
    ///
    /// Matching is case-insensitive and spans newlines, so multi-line scripts
    /// are removed as well. Closing tags may carry whitespace or stray
    /// attributes (`</script >`). Any opening or closing `script` tag left
    /// without a partner is removed in a second step.
    ///
    /// The two steps are separate regexes on purpose: combined into one
    /// alternation, every orphan tag would force a scan to the end of the
    /// input to rule out a paired match.
    fn remove_script_tags(&self, content: &str) -> String {
        let paired = Regex::new(r"(?is)<script(?:[\s/][^>]*)?>.*?</script(?:[\s/][^>]*)?>")
            .expect("paired script-tag pattern is a valid regex");
        let orphan = Regex::new(r"(?i)</?script(?:[\s/][^>]*)?>")
            .expect("orphan script-tag pattern is a valid regex");

        let without_pairs = paired.replace_all(content, "");
        orphan.replace_all(&without_pairs, "").into_owned()
    }

    /// Removes every `<tag>…</tag>` element, content included, from `content`.
    ///
    /// `tag` is regex-escaped before use. Matching is case-insensitive and
    /// spans newlines, and the tag name must be followed by whitespace, `/`
    /// or `>`, so `style` does not match `<styled-box>`. Opening or closing
    /// tags left without a partner (for example an unclosed
    /// `<iframe src=…>`) are removed in a second step.
    fn remove_tags(&self, tag: &str, content: &str) -> String {
        let paired_pattern = format!(
            r"(?is)<{tag}(?:[\s/][^>]*)?>.*?</{tag}(?:[\s/][^>]*)?>",
            tag = regex::escape(tag)
        );
        let orphan_pattern = format!(
            r"(?i)</?{tag}(?:[\s/][^>]*)?>",
            tag = regex::escape(tag)
        );

        let paired =
            Regex::new(&paired_pattern).expect("escaped tag name yields a valid paired pattern");
        let orphan =
            Regex::new(&orphan_pattern).expect("escaped tag name yields a valid orphan pattern");

        // Two sequential steps keep each search short; see the paired step
        // first so element content is dropped along with its tags.
        let without_pairs = paired.replace_all(content, "");
        orphan.replace_all(&without_pairs, "").into_owned()
    }

    /// Removes `on…=` event-handler attributes together with their values.
    ///
    /// Matching is case-insensitive, because HTML attribute names are:
    /// `onClick=`, `ONLOAD=` and handlers such as `onPointerEnter=` are
    /// removed just like their lowercase spellings. The value may be
    /// double-quoted, single-quoted, unquoted or missing.
    ///
    /// NOTE(review): the attribute must be preceded by whitespace, so
    /// `<img/onerror=…>` or `"x"onerror=…` are not matched by this pattern.
    fn remove_event_handlers(&self, content: &str) -> String {
        let regex = Regex::new(r#"(?i)\s+on[a-z]+\s*=\s*(?:"[^"]*"|'[^']*'|[^\s>]*)?"#)
            .expect("event-handler pattern is a valid regex");
        regex.replace_all(content, "").into_owned()
    }

    /// Blanks out `href` values using the `javascript:` scheme and `src`
    /// values using the `data:` scheme.
    ///
    /// Each offending attribute is replaced with an empty, properly quoted
    /// one (`href=""` / `src=""`), so the surrounding tag stays well formed.
    /// Matching is case-insensitive, tolerates whitespace before the scheme,
    /// and covers double-quoted, single-quoted and unquoted values.
    ///
    /// If a pattern fails to compile, that rule is skipped and the content
    /// is left as it was.
    fn clean_attributes(&self, content: &str) -> String {
        // (pattern, replacement) pairs, applied in order.
        let rules: [(&str, &str); 2] = [
            (
                r#"(?i)href\s*=\s*(?:"\s*javascript:[^"]*"|'\s*javascript:[^']*'|javascript:[^\s>]*)"#,
                r#"href="""#,
            ),
            (
                r#"(?i)src\s*=\s*(?:"\s*data:[^"]*"|'\s*data:[^']*'|data:[^\s>]*)"#,
                r#"src="""#,
            ),
        ];

        let mut result = content.to_string();
        for &(pattern, replacement) in rules.iter() {
            if let Ok(rule) = Regex::new(pattern) {
                result = rule.replace_all(&result, replacement).into_owned();
            }
        }

        result
    }

    /// Deletes everything that looks like a tag (`<` up to the next `>`) and
    /// returns the remaining text.
    fn strip_html(&self, content: &str) -> String {
        let tag_pattern = match Regex::new(r"<[^>]*>") {
            Ok(compiled) => compiled,
            // Fallback kept from the original: a pattern matching only "<>".
            Err(_) => Regex::new(r"<>").unwrap(),
        };

        let text_only = tag_pattern.replace_all(content, "");
        text_only.into_owned()
    }

    /// Validates a URL to ensure it's safe.
    ///
    /// The checks run on a normalized copy of the URL: surrounding spaces
    /// and control characters are trimmed, tabs and line breaks are removed
    /// from anywhere in the string, and the rest is lowercased. Browsers
    /// apply the same clean-up before reading the scheme, so without it
    /// `" JavaScript:alert(1)"` or `"java\tscript:alert(1)"` would pass.
    ///
    /// # Arguments
    ///
    /// * `url` - The URL to validate
    ///
    /// # Returns
    ///
    /// `Result<()>` - Ok if URL is safe, Err otherwise
    ///
    /// # Errors
    ///
    /// Returns `BlocksError::ValidationError` when the normalized URL starts
    /// with any entry of `DANGEROUS_KEYWORDS`, or contains `javascript:` or
    /// `data:text/html` anywhere. An empty (or whitespace-only) URL is
    /// accepted.
    pub fn validate_url(&self, url: &str) -> Result<()> {
        // Normalize once; every check below works on this copy.
        let normalized = url
            .trim_matches(|c: char| c <= ' ')
            .chars()
            .filter(|&c| !matches!(c, '\t' | '\n' | '\r'))
            .collect::<String>()
            .to_lowercase();

        if normalized.is_empty() {
            return Ok(());
        }

        // Check for dangerous protocols
        if let Some(keyword) = DANGEROUS_KEYWORDS
            .iter()
            .find(|keyword| normalized.starts_with(**keyword))
        {
            return Err(BlocksError::ValidationError {
                message: format!("Dangerous URL protocol detected: {}", keyword),
            });
        }

        // Check for common XSS patterns in URLs
        if normalized.contains("javascript:") || normalized.contains("data:text/html") {
            return Err(BlocksError::ValidationError {
                message: "URL contains potentially dangerous content".to_string(),
            });
        }

        Ok(())
    }
}

impl Default for ContentSanitizer {
    fn default() -> Self {
        Self::new()
    }
}

impl ContentSanitizer {
    /// Reports whether `tag`, compared in lowercase, is on the tag allow-list.
    pub fn is_tag_allowed(&self, tag: &str) -> bool {
        let key = tag.to_lowercase();
        self.allowed_tags.contains(key.as_str())
    }

    /// Reports whether `attr`, compared in lowercase, is on the attribute
    /// allow-list.
    pub fn is_attribute_allowed(&self, attr: &str) -> bool {
        let key = attr.to_lowercase();
        self.allowed_attributes.contains(key.as_str())
    }

    /// Puts the lowercased `tag` on the tag allow-list.
    pub fn allow_tag(&mut self, tag: &str) {
        let key = tag.to_lowercase();
        self.allowed_tags.insert(key);
    }

    /// Puts the lowercased `attr` on the attribute allow-list.
    pub fn allow_attribute(&mut self, attr: &str) {
        let key = attr.to_lowercase();
        self.allowed_attributes.insert(key);
    }

    /// Takes the lowercased `tag` off the tag allow-list, if present.
    pub fn disallow_tag(&mut self, tag: &str) {
        let key = tag.to_lowercase();
        self.allowed_tags.remove(key.as_str());
    }

    /// Takes the lowercased `attr` off the attribute allow-list, if present.
    pub fn disallow_attribute(&mut self, attr: &str) {
        let key = attr.to_lowercase();
        self.allowed_attributes.remove(key.as_str());
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Sanitizes `input` with a default-configured sanitizer, panicking on
    /// error.
    fn sanitize_default(input: &str) -> String {
        ContentSanitizer::new().sanitize(input).unwrap()
    }

    /// Script elements disappear while the surrounding text is kept.
    #[test]
    fn test_remove_script_tags() {
        let cleaned =
            sanitize_default(r#"<p>Hello</p><script>alert('xss')</script><p>World</p>"#);

        assert!(!cleaned.contains("script"));
        for kept in ["Hello", "World"].iter() {
            assert!(cleaned.contains(*kept));
        }
    }

    /// Inline event handlers do not survive sanitization.
    #[test]
    fn test_remove_event_handlers() {
        let cleaned = sanitize_default(r#"<p onclick="alert('xss')">Click me</p>"#);

        assert!(!cleaned.contains("onclick"));
        assert!(!cleaned.to_lowercase().contains("onclick="));
    }

    /// The `javascript:` scheme is gone from link targets.
    #[test]
    fn test_javascript_protocol() {
        let cleaned = sanitize_default(r#"<a href="javascript:alert('xss')">Click</a>"#);

        assert!(!cleaned.to_lowercase().contains("javascript:"));
    }

    /// Strict mode returns text only.
    #[test]
    fn test_strict_mode() {
        let text_only = ContentSanitizer::strict()
            .sanitize("<p>Hello <b>World</b></p>")
            .unwrap();

        assert_eq!(text_only, "Hello World");
    }

    /// Ordinary absolute and relative URLs are accepted.
    #[test]
    fn test_validate_safe_url() {
        let sanitizer = ContentSanitizer::new();
        let safe_urls = [
            "https://example.com",
            "http://example.com/path",
            "/relative/path",
        ];

        for url in safe_urls.iter() {
            assert!(sanitizer.validate_url(url).is_ok());
        }
    }

    /// Script-capable schemes are rejected.
    #[test]
    fn test_validate_dangerous_url() {
        let sanitizer = ContentSanitizer::new();
        let dangerous_urls = [
            "javascript:alert('xss')",
            "data:text/html,<script>alert('xss')</script>",
        ];

        for url in dangerous_urls.iter() {
            assert!(sanitizer.validate_url(url).is_err());
        }
    }

    /// Escaped text contains no raw angle brackets.
    #[test]
    fn test_sanitize_text() {
        let escaped = ContentSanitizer::new()
            .sanitize_text("<script>alert('xss')</script>")
            .unwrap();

        assert!(!escaped.contains('<'));
        assert!(!escaped.contains('>'));
    }
}