buup/transformers/
html_decode.rs

1use crate::{Transform, TransformError, TransformerCategory};
2
/// Transformer that turns HTML character references back into the characters
/// they represent.
///
/// This is a stateless unit type: every value is interchangeable, so it is
/// cheap to copy and compare.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct HtmlDecode;
6
/// Default test input for HTML Decode.
///
/// The sample is entity-encoded (it is the same text that
/// `HtmlDecode::default_test_input` returns), so feeding it to the decoder
/// actually exercises named and numeric references. It decodes to
/// `<p>Hello & Welcome!</p>`.
pub const DEFAULT_TEST_INPUT: &str = "&lt;p&gt;Hello &amp; Welcome!&lt;&#47;p&gt;";
9
10impl Transform for HtmlDecode {
11    fn name(&self) -> &'static str {
12        "HTML Decode"
13    }
14
15    fn id(&self) -> &'static str {
16        "htmldecode"
17    }
18
19    fn description(&self) -> &'static str {
20        "Decodes HTML entities (e.g., &lt;) back into characters (<)."
21    }
22
23    fn category(&self) -> TransformerCategory {
24        TransformerCategory::Decoder
25    }
26
27    fn default_test_input(&self) -> &'static str {
28        "&lt;p&gt;Hello &amp; Welcome!&lt;&#47;p&gt;"
29    }
30
31    fn transform(&self, input: &str) -> Result<String, TransformError> {
32        if input.is_empty() {
33            return Ok(String::new());
34        }
35
36        // Initial capacity is input length (a reasonable guess, might be smaller after decoding)
37        let mut result = String::with_capacity(input.len());
38
39        let mut chars = input.chars().peekable();
40        while let Some(c) = chars.next() {
41            if c == '&' {
42                let mut entity = String::with_capacity(10); // Typical entity length is small
43                entity.push(c);
44
45                // Collect characters until ';' or max entity length (safety)
46                let mut entity_length = 1; // Already pushed '&'
47                const MAX_ENTITY_LENGTH: usize = 12; // Practical limit for an HTML entity
48
49                while let Some(&next_char) = chars.peek() {
50                    if next_char == ';' || entity_length >= MAX_ENTITY_LENGTH {
51                        entity.push(next_char);
52                        chars.next(); // Consume the character
53                        break;
54                    }
55                    entity.push(next_char);
56                    chars.next(); // Consume the character
57                    entity_length += 1;
58                }
59
60                // Attempt to decode the entity
61                if let Some(decoded) = decode_html_entity(&entity) {
62                    result.push(decoded);
63                } else {
64                    // If we can't decode, pass through the original entity
65                    result.push_str(&entity);
66                }
67            } else {
68                result.push(c);
69            }
70        }
71
72        Ok(result)
73    }
74}
75
/// Decodes a single HTML entity to a character.
///
/// `entity` must be the complete reference, including the leading `&` and the
/// trailing `;`. Supported forms:
///
/// * the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and `&apos;`;
/// * decimal references such as `&#8364;`;
/// * hexadecimal references such as `&#x20AC;` (the `x` may be upper- or
///   lower-case).
///
/// Returns `None` when the text is not a complete entity, the name is unknown,
/// the number contains anything other than digits of the expected radix, or
/// the number is not a valid Unicode scalar value.
fn decode_html_entity(entity: &str) -> Option<char> {
    let name = entity.strip_prefix('&')?.strip_suffix(';')?;

    match name {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let reference = name.strip_prefix('#')?;
            let (digits, radix) = match reference
                .strip_prefix('x')
                .or_else(|| reference.strip_prefix('X'))
            {
                Some(hex) => (hex, 16),
                None => (reference, 10),
            };

            // The integer parser would accept a leading `+`, which is not
            // valid in a character reference, so validate the digits first.
            if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
                return None;
            }

            u32::from_str_radix(digits, radix)
                .ok()
                .and_then(std::char::from_u32)
        }
    }
}
103
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the decoder over `input`, panicking if the transform fails.
    fn decode(input: &str) -> String {
        HtmlDecode.transform(input).unwrap()
    }

    #[test]
    fn test_html_decode() {
        // Each row is (input, expected output).
        let cases: [(&str, &str); 9] = [
            // Default input
            (DEFAULT_TEST_INPUT, "<p>Hello & Welcome!</p>"),
            // Several named entities in one string
            (
                "&lt;script&gt;alert(&quot;XSS attack&quot;);&lt;/script&gt;",
                "<script>alert(\"XSS attack\");</script>",
            ),
            // Adjacent entities and comparison operators
            ("a &lt; b &amp;&amp; c &gt; d", "a < b && c > d"),
            // Quotes, backticks and equals signs given as decimal references
            (
                "Don&#39;t use &#96;eval(input)&#96; or query&#61;&#39;unsafe&#39;",
                "Don't use `eval(input)` or query='unsafe'",
            ),
            // Decimal and hexadecimal references to the same code point
            ("Euro symbol: &#8364; or &#x20AC;", "Euro symbol: € or €"),
            // Nothing to decode
            (
                "Normal text with no entities",
                "Normal text with no entities",
            ),
            // Empty input
            ("", ""),
            // Unterminated entity is left untouched
            (
                "This is an &incomplete entity",
                "This is an &incomplete entity",
            ),
            // Unknown or malformed entities are left untouched
            (
                "This is &invalid; and &#invalid;",
                "This is &invalid; and &#invalid;",
            ),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(decode(input), *expected, "input: {:?}", input);
        }
    }
}