reasonkit_web/handlers/
capture.rs

1//! Capture endpoint handler for the ReasonKit Web MCP server
2//!
3//! This module provides the `/capture` HTTP endpoint for receiving DOM content
4//! from browser extensions or other sources, processing it, and storing it
5//! in a bounded memory buffer for later retrieval.
6//!
7//! # Features
8//!
9//! - URL format validation
10//! - Content length limits (configurable)
11//! - HTML entity decoding
12//! - Script/style tag removal
13//! - Whitespace normalization
14//! - Metrics tracking (capture count, processing time)
15//!
16//! # Error Handling
17//!
18//! - `400 Bad Request` - Invalid request (malformed URL, missing fields)
19//! - `413 Payload Too Large` - Content exceeds maximum allowed size
20//! - `500 Internal Server Error` - Processing or storage errors
21
22use axum::{
23    extract::State,
24    http::StatusCode,
25    response::{IntoResponse, Response},
26    Json,
27};
28use chrono::{DateTime, Utc};
29use metrics::{counter, histogram};
30use regex::Regex;
31use serde::{Deserialize, Serialize};
32use std::sync::Arc;
33use std::time::Instant;
34use tokio::sync::mpsc;
35use tracing::{debug, error, info, instrument, warn};
36use url::Url;
37use uuid::Uuid;
38
39// ============================================================================
40// Configuration
41// ============================================================================
42
/// Largest request `content` accepted by default, in bytes (10 MiB).
///
/// `CaptureConfig::default()` uses this value for `max_content_length`.
pub const DEFAULT_MAX_CONTENT_LENGTH: usize = 10 * (1 << 20);

/// Default capacity for the bounded capture channel.
pub const DEFAULT_BUFFER_CAPACITY: usize = 1_000;
48
/// Tunable limits and processing switches for the capture handler.
#[derive(Clone, Debug)]
pub struct CaptureConfig {
    /// Upper bound on the raw request content, in bytes. Requests whose
    /// content is longer are rejected with `CaptureError::ContentTooLarge`.
    pub max_content_length: usize,
    /// Byte threshold applied to the *processed* text. Longer text is cut
    /// back and suffixed with `...`; `None` disables truncation.
    pub truncate_length: Option<usize>,
    /// Remove `<script>…</script>` elements before text extraction.
    pub strip_scripts: bool,
    /// Remove `<style>…</style>` elements before text extraction.
    pub strip_styles: bool,
    /// Decode HTML entities (e.g. `&amp;`) in the extracted text.
    pub decode_entities: bool,
    /// Collapse runs of spaces/tabs and blank lines, and trim each line.
    pub normalize_whitespace: bool,
}
65
66impl Default for CaptureConfig {
67    fn default() -> Self {
68        Self {
69            max_content_length: DEFAULT_MAX_CONTENT_LENGTH,
70            truncate_length: Some(1024 * 1024), // 1 MB after processing
71            strip_scripts: true,
72            strip_styles: true,
73            decode_entities: true,
74            normalize_whitespace: true,
75        }
76    }
77}
78
79// ============================================================================
80// Request/Response Types
81// ============================================================================
82
83/// Request body for the `/capture` endpoint
84#[derive(Debug, Clone, Deserialize)]
85pub struct CaptureRequest {
86    /// The URL of the captured page
87    pub url: String,
88
89    /// The DOM content (HTML)
90    pub content: String,
91
92    /// Optional title of the page
93    #[serde(default)]
94    pub title: Option<String>,
95
96    /// Optional description/meta description
97    #[serde(default)]
98    pub description: Option<String>,
99
100    /// Optional timestamp of capture (defaults to server time)
101    #[serde(default)]
102    pub captured_at: Option<DateTime<Utc>>,
103
104    /// Optional metadata as key-value pairs
105    #[serde(default)]
106    pub metadata: Option<serde_json::Value>,
107}
108
109/// Response body for successful capture
110#[derive(Debug, Clone, Serialize)]
111pub struct CaptureResponse {
112    /// Unique identifier for this capture
113    pub id: Uuid,
114
115    /// The URL that was captured
116    pub url: String,
117
118    /// Timestamp when the capture was processed
119    pub processed_at: DateTime<Utc>,
120
121    /// Size of the original content in bytes
122    pub original_size: usize,
123
124    /// Size of the processed content in bytes
125    pub processed_size: usize,
126
127    /// Whether the content was truncated
128    pub truncated: bool,
129
130    /// Processing duration in milliseconds
131    pub processing_time_ms: u64,
132}
133
134/// Processed capture data stored in the buffer
135#[derive(Debug, Clone, Serialize)]
136pub struct ProcessedCapture {
137    /// Unique identifier
138    pub id: Uuid,
139
140    /// Original URL
141    pub url: String,
142
143    /// Page title
144    pub title: Option<String>,
145
146    /// Page description
147    pub description: Option<String>,
148
149    /// Processed text content
150    pub content: String,
151
152    /// Original HTML (if preserved)
153    pub original_html: Option<String>,
154
155    /// Capture timestamp
156    pub captured_at: DateTime<Utc>,
157
158    /// Processing timestamp
159    pub processed_at: DateTime<Utc>,
160
161    /// Additional metadata
162    pub metadata: Option<serde_json::Value>,
163}
164
165// ============================================================================
166// Error Types
167// ============================================================================
168
169/// Errors that can occur during capture processing
170#[derive(Debug, thiserror::Error)]
171pub enum CaptureError {
172    /// Invalid request (missing fields, malformed data)
173    #[error("Invalid request: {0}")]
174    InvalidRequest(String),
175
176    /// Invalid URL format
177    #[error("Invalid URL: {0}")]
178    InvalidUrl(String),
179
180    /// Content exceeds maximum allowed size
181    #[error("Content too large: {size} bytes exceeds maximum of {max} bytes")]
182    ContentTooLarge {
183        /// Actual content size in bytes
184        size: usize,
185        /// Maximum allowed size in bytes
186        max: usize,
187    },
188
189    /// Content processing failed
190    #[error("Processing error: {0}")]
191    ProcessingError(String),
192
193    /// Failed to store capture in buffer
194    #[error("Storage error: {0}")]
195    StorageError(String),
196
197    /// Internal server error
198    #[error("Internal error: {0}")]
199    InternalError(String),
200}
201
202impl IntoResponse for CaptureError {
203    fn into_response(self) -> Response {
204        let (status, error_type, message) = match &self {
205            CaptureError::InvalidRequest(msg) => {
206                (StatusCode::BAD_REQUEST, "invalid_request", msg.clone())
207            }
208            CaptureError::InvalidUrl(msg) => (StatusCode::BAD_REQUEST, "invalid_url", msg.clone()),
209            CaptureError::ContentTooLarge { size, max } => (
210                StatusCode::PAYLOAD_TOO_LARGE,
211                "content_too_large",
212                format!("Content size {} exceeds maximum {}", size, max),
213            ),
214            CaptureError::ProcessingError(msg) => (
215                StatusCode::INTERNAL_SERVER_ERROR,
216                "processing_error",
217                msg.clone(),
218            ),
219            CaptureError::StorageError(msg) => (
220                StatusCode::INTERNAL_SERVER_ERROR,
221                "storage_error",
222                msg.clone(),
223            ),
224            CaptureError::InternalError(msg) => (
225                StatusCode::INTERNAL_SERVER_ERROR,
226                "internal_error",
227                msg.clone(),
228            ),
229        };
230
231        // Increment error counter
232        counter!("capture_errors_total", "type" => error_type).increment(1);
233
234        let body = serde_json::json!({
235            "error": {
236                "type": error_type,
237                "message": message,
238            }
239        });
240
241        (status, Json(body)).into_response()
242    }
243}
244
245// ============================================================================
246// Shared State
247// ============================================================================
248
249/// Shared state for the capture handler
250#[derive(Clone)]
251pub struct CaptureState {
252    /// Configuration
253    pub config: CaptureConfig,
254    /// Channel sender for storing captures
255    pub sender: mpsc::Sender<ProcessedCapture>,
256}
257
258impl CaptureState {
259    /// Create a new capture state with the given configuration
260    pub fn new(config: CaptureConfig, sender: mpsc::Sender<ProcessedCapture>) -> Self {
261        Self { config, sender }
262    }
263
264    /// Create a new capture state with default configuration
265    pub fn with_defaults(sender: mpsc::Sender<ProcessedCapture>) -> Self {
266        Self::new(CaptureConfig::default(), sender)
267    }
268}
269
270/// Create a new bounded channel for capture storage
271pub fn create_capture_buffer(
272    capacity: usize,
273) -> (
274    mpsc::Sender<ProcessedCapture>,
275    mpsc::Receiver<ProcessedCapture>,
276) {
277    mpsc::channel(capacity)
278}
279
280// ============================================================================
281// Content Processing
282// ============================================================================
283
284/// Content processor for cleaning and normalizing HTML
285pub struct ContentProcessor {
286    /// Compiled regex for script tags
287    script_regex: Regex,
288    /// Compiled regex for style tags
289    style_regex: Regex,
290    /// Compiled regex for HTML tags
291    tag_regex: Regex,
292    /// Compiled regex for multiple whitespace
293    whitespace_regex: Regex,
294    /// Compiled regex for multiple newlines
295    newline_regex: Regex,
296}
297
298impl ContentProcessor {
299    /// Create a new content processor with pre-compiled regexes
300    pub fn new() -> Self {
301        Self {
302            script_regex: Regex::new(r"(?is)<script[^>]*>[\s\S]*?</script>").unwrap(),
303            style_regex: Regex::new(r"(?is)<style[^>]*>[\s\S]*?</style>").unwrap(),
304            tag_regex: Regex::new(r"<[^>]+>").unwrap(),
305            whitespace_regex: Regex::new(r"[ \t]+").unwrap(),
306            newline_regex: Regex::new(r"\n{3,}").unwrap(),
307        }
308    }
309
310    /// Process HTML content according to the given configuration
311    #[instrument(skip(self, html, config))]
312    pub fn process(&self, html: &str, config: &CaptureConfig) -> Result<String, CaptureError> {
313        let mut content = html.to_string();
314
315        // Strip script tags if configured
316        if config.strip_scripts {
317            content = self.script_regex.replace_all(&content, "").to_string();
318            debug!("Stripped script tags");
319        }
320
321        // Strip style tags if configured
322        if config.strip_styles {
323            content = self.style_regex.replace_all(&content, "").to_string();
324            debug!("Stripped style tags");
325        }
326
327        // Replace block elements with newlines for better text extraction
328        content = content
329            .replace("</p>", "\n")
330            .replace("</div>", "\n")
331            .replace("</li>", "\n")
332            .replace("</tr>", "\n")
333            .replace("<br>", "\n")
334            .replace("<br/>", "\n")
335            .replace("<br />", "\n");
336
337        // Strip all remaining HTML tags
338        content = self.tag_regex.replace_all(&content, "").to_string();
339
340        // Decode HTML entities if configured
341        if config.decode_entities {
342            content = Self::decode_html_entities(&content);
343            debug!("Decoded HTML entities");
344        }
345
346        // Normalize whitespace if configured
347        if config.normalize_whitespace {
348            // Replace multiple spaces/tabs with single space
349            content = self.whitespace_regex.replace_all(&content, " ").to_string();
350            // Replace multiple newlines with double newline
351            content = self.newline_regex.replace_all(&content, "\n\n").to_string();
352            // Trim each line
353            content = content
354                .lines()
355                .map(|l| l.trim())
356                .collect::<Vec<_>>()
357                .join("\n");
358            debug!("Normalized whitespace");
359        }
360
361        // Final trim
362        content = content.trim().to_string();
363
364        Ok(content)
365    }
366
367    /// Decode common HTML entities
368    fn decode_html_entities(text: &str) -> String {
369        // Use htmlescape crate for comprehensive decoding, with fallback
370        match htmlescape::decode_html(text) {
371            Ok(decoded) => decoded,
372            Err(_) => {
373                // Fallback to manual decoding for common entities
374                text.replace("&nbsp;", " ")
375                    .replace("&lt;", "<")
376                    .replace("&gt;", ">")
377                    .replace("&amp;", "&")
378                    .replace("&quot;", "\"")
379                    .replace("&#39;", "'")
380                    .replace("&apos;", "'")
381                    .replace("&mdash;", "\u{2014}")
382                    .replace("&ndash;", "\u{2013}")
383                    .replace("&hellip;", "\u{2026}")
384                    .replace("&lsquo;", "\u{2018}")
385                    .replace("&rsquo;", "\u{2019}")
386                    .replace("&ldquo;", "\u{201C}")
387                    .replace("&rdquo;", "\u{201D}")
388                    .replace("&copy;", "\u{00A9}")
389                    .replace("&reg;", "\u{00AE}")
390                    .replace("&trade;", "\u{2122}")
391            }
392        }
393    }
394}
395
396impl Default for ContentProcessor {
397    fn default() -> Self {
398        Self::new()
399    }
400}
401
402// ============================================================================
403// URL Validation
404// ============================================================================
405
406/// Validate a URL string
407fn validate_url(url_str: &str) -> Result<Url, CaptureError> {
408    // Check for empty URL
409    if url_str.is_empty() {
410        return Err(CaptureError::InvalidUrl("URL cannot be empty".to_string()));
411    }
412
413    // Parse the URL
414    let url = Url::parse(url_str)
415        .map_err(|e| CaptureError::InvalidUrl(format!("Failed to parse URL: {}", e)))?;
416
417    // Validate scheme
418    match url.scheme() {
419        "http" | "https" => {}
420        scheme => {
421            return Err(CaptureError::InvalidUrl(format!(
422                "Invalid URL scheme '{}': only http and https are allowed",
423                scheme
424            )));
425        }
426    }
427
428    // Validate host
429    if url.host().is_none() {
430        return Err(CaptureError::InvalidUrl(
431            "URL must have a valid host".to_string(),
432        ));
433    }
434
435    Ok(url)
436}
437
438// ============================================================================
439// Handler Implementation
440// ============================================================================
441
442/// Handle the `/capture` POST endpoint
443///
444/// This endpoint accepts JSON payload with DOM content, processes it,
445/// stores it in the memory buffer, and returns a UUID for tracking.
446///
447/// # Request Body
448///
449/// ```json
450/// {
451///     "url": "https://example.com/page",
452///     "content": "<html>...</html>",
453///     "title": "Page Title",
454///     "description": "Page description",
455///     "captured_at": "2024-01-01T00:00:00Z",
456///     "metadata": { "key": "value" }
457/// }
458/// ```
459///
460/// # Response
461///
462/// ```json
463/// {
464///     "id": "550e8400-e29b-41d4-a716-446655440000",
465///     "url": "https://example.com/page",
466///     "processed_at": "2024-01-01T00:00:01Z",
467///     "original_size": 10240,
468///     "processed_size": 5120,
469///     "truncated": false,
470///     "processing_time_ms": 15
471/// }
472/// ```
473///
474/// # Errors
475///
476/// - `400 Bad Request` - Invalid URL format or missing required fields
477/// - `413 Payload Too Large` - Content exceeds maximum allowed size
478/// - `500 Internal Server Error` - Processing or storage errors
479#[instrument(skip(state, request), fields(url = %request.url))]
480pub async fn capture_handler(
481    State(state): State<Arc<CaptureState>>,
482    Json(request): Json<CaptureRequest>,
483) -> Result<Json<CaptureResponse>, CaptureError> {
484    let start_time = Instant::now();
485    info!("Processing capture request for URL: {}", request.url);
486
487    // Validate URL format
488    let validated_url = validate_url(&request.url)?;
489    debug!("URL validated: {}", validated_url);
490
491    // Check content length
492    let original_size = request.content.len();
493    if original_size > state.config.max_content_length {
494        warn!(
495            "Content too large: {} bytes (max: {})",
496            original_size, state.config.max_content_length
497        );
498        return Err(CaptureError::ContentTooLarge {
499            size: original_size,
500            max: state.config.max_content_length,
501        });
502    }
503
504    // Process content
505    let processor = ContentProcessor::new();
506    let processed_content = processor.process(&request.content, &state.config)?;
507
508    // Check for truncation
509    let (final_content, truncated) = match state.config.truncate_length {
510        Some(max_len) if processed_content.len() > max_len => {
511            // Truncate at word boundary if possible
512            let truncated_content =
513                if let Some(last_space) = processed_content[..max_len].rfind(' ') {
514                    format!("{}...", &processed_content[..last_space])
515                } else {
516                    format!("{}...", &processed_content[..max_len])
517                };
518            info!(
519                "Content truncated from {} to {} bytes",
520                processed_content.len(),
521                truncated_content.len()
522            );
523            (truncated_content, true)
524        }
525        _ => (processed_content, false),
526    };
527
528    let processed_size = final_content.len();
529
530    // Generate UUID
531    let id = Uuid::new_v4();
532    let now = Utc::now();
533
534    // Create processed capture
535    let capture = ProcessedCapture {
536        id,
537        url: validated_url.to_string(),
538        title: request.title,
539        description: request.description,
540        content: final_content,
541        original_html: None, // Don't store original to save memory
542        captured_at: request.captured_at.unwrap_or(now),
543        processed_at: now,
544        metadata: request.metadata,
545    };
546
547    // Store in buffer
548    state.sender.send(capture).await.map_err(|e| {
549        error!("Failed to store capture in buffer: {}", e);
550        CaptureError::StorageError(format!("Buffer full or closed: {}", e))
551    })?;
552
553    let processing_time = start_time.elapsed();
554    let processing_time_ms = processing_time.as_millis() as u64;
555
556    // Record metrics
557    counter!("captures_total").increment(1);
558    histogram!("capture_processing_time_seconds").record(processing_time.as_secs_f64());
559    histogram!("capture_original_size_bytes").record(original_size as f64);
560    histogram!("capture_processed_size_bytes").record(processed_size as f64);
561
562    if truncated {
563        counter!("captures_truncated_total").increment(1);
564    }
565
566    info!(
567        "Capture processed successfully: id={}, original_size={}, processed_size={}, time={}ms",
568        id, original_size, processed_size, processing_time_ms
569    );
570
571    Ok(Json(CaptureResponse {
572        id,
573        url: validated_url.to_string(),
574        processed_at: now,
575        original_size,
576        processed_size,
577        truncated,
578        processing_time_ms,
579    }))
580}
581
582/// Health check endpoint for the capture service
583pub async fn capture_health() -> impl IntoResponse {
584    Json(serde_json::json!({
585        "status": "healthy",
586        "service": "capture",
587        "timestamp": Utc::now().to_rfc3339()
588    }))
589}
590
591// ============================================================================
592// Router Configuration
593// ============================================================================
594
595use axum::{routing::post, Router};
596
597/// Create the capture router with all endpoints
598pub fn capture_router(state: Arc<CaptureState>) -> Router {
599    Router::new()
600        .route("/capture", post(capture_handler))
601        .route("/capture/health", axum::routing::get(capture_health))
602        .with_state(state)
603}
604
605// ============================================================================
606// Tests
607// ============================================================================
608
609#[cfg(test)]
610mod tests {
611    use super::*;
612
613    #[test]
614    fn test_validate_url_valid_http() {
615        let result = validate_url("http://example.com/page");
616        assert!(result.is_ok());
617    }
618
619    #[test]
620    fn test_validate_url_valid_https() {
621        let result = validate_url("https://example.com/page?query=1");
622        assert!(result.is_ok());
623    }
624
625    #[test]
626    fn test_validate_url_empty() {
627        let result = validate_url("");
628        assert!(matches!(result, Err(CaptureError::InvalidUrl(_))));
629    }
630
631    #[test]
632    fn test_validate_url_invalid_scheme() {
633        let result = validate_url("ftp://example.com/file");
634        assert!(matches!(result, Err(CaptureError::InvalidUrl(_))));
635    }
636
637    #[test]
638    fn test_validate_url_no_host() {
639        // Note: For http/https URLs, the url crate always interprets something as a host.
640        // "http:///path" is parsed with "path" as the domain and "/" as the path.
641        // Testing a truly hostless URL requires a non-http scheme which fails the scheme check.
642        // This test verifies invalid schemes are rejected:
643        let result = validate_url("file:///path");
644        assert!(matches!(result, Err(CaptureError::InvalidUrl(_))));
645
646        // Also verify that IP-like invalid hosts still parse as domains:
647        let result = validate_url("http:///");
648        // The url crate treats this as host="" which becomes None for http
649        // but may vary - we just ensure no panics and proper error handling
650        assert!(result.is_err() || result.unwrap().host().is_some());
651    }
652
653    #[test]
654    fn test_validate_url_malformed() {
655        let result = validate_url("not a url");
656        assert!(matches!(result, Err(CaptureError::InvalidUrl(_))));
657    }
658
659    #[test]
660    fn test_content_processor_strips_scripts() {
661        let processor = ContentProcessor::new();
662        let config = CaptureConfig::default();
663
664        let html = "<p>Hello</p><script>evil();</script><p>World</p>";
665        let result = processor.process(html, &config).unwrap();
666
667        assert!(!result.contains("script"));
668        assert!(!result.contains("evil"));
669        assert!(result.contains("Hello"));
670        assert!(result.contains("World"));
671    }
672
673    #[test]
674    fn test_content_processor_strips_styles() {
675        let processor = ContentProcessor::new();
676        let config = CaptureConfig::default();
677
678        let html = "<p>Content</p><style>.hidden { display: none; }</style>";
679        let result = processor.process(html, &config).unwrap();
680
681        assert!(!result.contains("style"));
682        assert!(!result.contains("display"));
683        assert!(result.contains("Content"));
684    }
685
686    #[test]
687    fn test_content_processor_decodes_entities() {
688        let processor = ContentProcessor::new();
689        let config = CaptureConfig::default();
690
691        let html = "<p>Hello &amp; World &lt;test&gt;</p>";
692        let result = processor.process(html, &config).unwrap();
693
694        assert!(result.contains("Hello & World <test>"));
695    }
696
697    #[test]
698    fn test_content_processor_normalizes_whitespace() {
699        let processor = ContentProcessor::new();
700        let config = CaptureConfig::default();
701
702        let html = "<p>Hello    World</p>\n\n\n\n<p>Next</p>";
703        let result = processor.process(html, &config).unwrap();
704
705        // Should not have excessive whitespace
706        assert!(!result.contains("    "));
707        assert!(!result.contains("\n\n\n"));
708    }
709
710    #[test]
711    fn test_content_processor_strips_tags() {
712        let processor = ContentProcessor::new();
713        let config = CaptureConfig::default();
714
715        let html = "<div class=\"container\"><p>Text</p></div>";
716        let result = processor.process(html, &config).unwrap();
717
718        assert!(!result.contains("<"));
719        assert!(!result.contains(">"));
720        assert!(result.contains("Text"));
721    }
722
723    #[test]
724    fn test_capture_request_deserialization() {
725        let json = r#"{
726            "url": "https://example.com",
727            "content": "<p>Hello</p>",
728            "title": "Test Page"
729        }"#;
730
731        let request: CaptureRequest = serde_json::from_str(json).unwrap();
732        assert_eq!(request.url, "https://example.com");
733        assert_eq!(request.content, "<p>Hello</p>");
734        assert_eq!(request.title, Some("Test Page".to_string()));
735        assert!(request.description.is_none());
736    }
737
738    #[test]
739    fn test_capture_response_serialization() {
740        let response = CaptureResponse {
741            id: Uuid::new_v4(),
742            url: "https://example.com".to_string(),
743            processed_at: Utc::now(),
744            original_size: 1000,
745            processed_size: 500,
746            truncated: false,
747            processing_time_ms: 10,
748        };
749
750        let json = serde_json::to_string(&response).unwrap();
751        assert!(json.contains("\"id\""));
752        assert!(json.contains("\"url\""));
753        assert!(json.contains("\"processed_size\""));
754    }
755
756    #[test]
757    fn test_capture_config_default() {
758        let config = CaptureConfig::default();
759        assert_eq!(config.max_content_length, DEFAULT_MAX_CONTENT_LENGTH);
760        assert!(config.strip_scripts);
761        assert!(config.strip_styles);
762        assert!(config.decode_entities);
763        assert!(config.normalize_whitespace);
764    }
765
766    #[tokio::test]
767    async fn test_capture_buffer_channel() {
768        let (tx, mut rx) = create_capture_buffer(10);
769
770        let capture = ProcessedCapture {
771            id: Uuid::new_v4(),
772            url: "https://example.com".to_string(),
773            title: Some("Test".to_string()),
774            description: None,
775            content: "Hello World".to_string(),
776            original_html: None,
777            captured_at: Utc::now(),
778            processed_at: Utc::now(),
779            metadata: None,
780        };
781
782        tx.send(capture.clone()).await.unwrap();
783
784        let received = rx.recv().await.unwrap();
785        assert_eq!(received.url, "https://example.com");
786        assert_eq!(received.content, "Hello World");
787    }
788
789    #[test]
790    fn test_capture_error_into_response() {
791        let error = CaptureError::InvalidRequest("test error".to_string());
792        let response = error.into_response();
793        assert_eq!(response.status(), StatusCode::BAD_REQUEST);
794
795        let error = CaptureError::ContentTooLarge { size: 100, max: 50 };
796        let response = error.into_response();
797        assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE);
798
799        let error = CaptureError::ProcessingError("failed".to_string());
800        let response = error.into_response();
801        assert_eq!(response.status(), StatusCode::INTERNAL_SERVER_ERROR);
802    }
803
804    #[test]
805    fn test_html_entity_decoding_comprehensive() {
806        let text = "&nbsp;&lt;&gt;&amp;&quot;&#39;&apos;&mdash;&ndash;&hellip;";
807        let decoded = ContentProcessor::decode_html_entities(text);
808
809        assert!(decoded.contains('<'));
810        assert!(decoded.contains('>'));
811        assert!(decoded.contains('&'));
812        assert!(decoded.contains('"'));
813        assert!(decoded.contains('\''));
814    }
815}