tiny_clean/
xml_encoder.rs

1use crate::common::{char_bucket, char_mask};
2
/// Selects which XML context an [`XmlEncoder`] produces output for.
///
/// The mode only decides which of the five markup-significant characters
/// (`&`, `<`, `>`, `'`, `"`) are escaped; every mode escapes `&` and `<`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum XmlEncoderMode {
    /// Escapes `&`, `<`, `>`, `'` and `"`.
    All,
    /// Escapes `&`, `<` and `>`; both quote characters are left as-is.
    Content,
    /// Escapes `&`, `<`, `'` and `"`; `>` is left as-is.
    Attribute,
    /// Escapes `&`, `<` and `'`; `>` and `"` are left as-is.
    SingleQuotedAttribute,
    /// Escapes `&`, `<` and `"`; `>` and `'` are left as-is.
    DoubleQuotedAttribute,
}
10
/// Escapes text for inclusion in an XML document.
///
/// Construct one with `XmlEncoder::new` and reuse it for any number of
/// `encode` calls; the encoder holds no per-call state.
#[derive(Debug, Clone)]
pub struct XmlEncoder {
    /// One bitmask per character bucket, indexed by `char_bucket(c)` and
    /// tested against `char_mask(c)`. A set bit means the character is copied
    /// to the output unchanged.
    // NOTE(review): presumably each bucket spans 32 consecutive code points —
    // confirm against `crate::common`.
    valid_masks: [u32; 4],
}
14
15impl XmlEncoder {
16    pub fn new(mode: XmlEncoderMode) -> Self {
17        let base_mask = char_mask('\r') | char_mask('\t') | char_mask('\n');
18        match mode {
19            XmlEncoderMode::All => {
20                let to_be_encoded = ['&', '<', '>', '\'', '"'];
21                let mut to_be_encoded_mask = 0u32;
22                for char in to_be_encoded {
23                    to_be_encoded_mask |= char_mask(char);
24                }
25                let valid_masks = [
26                    base_mask,
27                    u32::MAX & !to_be_encoded_mask,
28                    u32::MAX,
29                    u32::MAX
30                ];
31                Self {valid_masks}
32            }
33            XmlEncoderMode::Content => {
34                let to_be_encoded = ['&', '<', '>'];
35                let mut to_be_encoded_mask = 0u32;
36                for char in to_be_encoded {
37                    to_be_encoded_mask |= char_mask(char);
38                }
39                let valid_masks = [
40                    base_mask,
41                    u32::MAX & !to_be_encoded_mask,
42                    u32::MAX,
43                    u32::MAX
44                ];
45                Self {valid_masks}
46            }
47            XmlEncoderMode::Attribute => {
48                let to_be_encoded = ['&', '<', '\'', '"'];
49                let mut to_be_encoded_mask = 0u32;
50                for char in to_be_encoded {
51                    to_be_encoded_mask |= char_mask(char);
52                }
53                let valid_masks = [
54                    base_mask,
55                    u32::MAX & !to_be_encoded_mask,
56                    u32::MAX,
57                    u32::MAX
58                ];
59                Self {valid_masks}
60            }
61            XmlEncoderMode::SingleQuotedAttribute => {
62                let to_be_encoded = ['&', '<', '\''];
63                let mut to_be_encoded_mask = 0u32;
64                for char in to_be_encoded {
65                    to_be_encoded_mask |= char_mask(char);
66                }
67                let valid_masks = [
68                    base_mask,
69                    u32::MAX & !to_be_encoded_mask,
70                    u32::MAX,
71                    u32::MAX
72                ];
73                Self {valid_masks}
74            }
75            XmlEncoderMode::DoubleQuotedAttribute => {
76                let to_be_encoded = ['&', '<', '"'];
77                let mut to_be_encoded_mask = 0u32;
78                for char in to_be_encoded {
79                    to_be_encoded_mask |= char_mask(char);
80                }
81                let valid_masks = [
82                    base_mask,
83                    u32::MAX & !to_be_encoded_mask,
84                    u32::MAX,
85                    u32::MAX
86                ];
87                Self {valid_masks}
88            }
89        }
90    }
91
92    pub fn encode(&self, input: &str) -> String {
93        let max_capacity = (u32::MAX / 2).min((input.len() * 5) as u32) as usize;
94        let mut result = String::with_capacity(max_capacity);
95        for c in input.chars() {
96            if (c as u32) < 127 {
97                let bucket = char_bucket(c);
98                let mask = char_mask(c);
99                if c > '>' || self.valid_masks[bucket] & mask != 0 {
100                    result.push(c);
101                } else {
102                    match c {
103                        '&' => {
104                            result.push('&');
105                            result.push('a');
106                            result.push('m');
107                            result.push('p');
108                            result.push(';');
109                        }
110                        '<' => {
111                            result.push('&');
112                            result.push('l');
113                            result.push('t');
114                            result.push(';');
115                        }
116                        '>' => {
117                            result.push('&');
118                            result.push('g');
119                            result.push('t');
120                            result.push(';');
121                        }
122                        '\'' => {
123                            result.push('&');
124                            result.push('#');
125                            result.push('3');
126                            result.push('9');
127                            result.push(';');
128                        }
129                        '\"' => {
130                            result.push('&');
131                            result.push('#');
132                            result.push('3');
133                            result.push('4');
134                            result.push(';');
135                        }
136                        _ => result.push(' '),
137                    }
138                }
139            } else if c > '\u{fffd}' || (c >= '\u{fdd0}' && c <= '\u{fdef}') {
140                result.push(' ');
141            } else {
142                result.push(c);
143            }
144        }
145        result.shrink_to_fit();
146        result
147    }
148}
149
#[cfg(test)]
mod test {
    use crate::xml_encoder::{XmlEncoder, XmlEncoderMode};

    /// Checks each `(input, expected)` pair against the encoder, in order.
    fn assert_encodes(encoder: &XmlEncoder, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(expected, encoder.encode(input));
        }
    }

    /// Mode-independent checks: U+FFFD passes through, U+FFFF becomes a space.
    fn generic_tests(encoder: &XmlEncoder) {
        assert_encodes(encoder, &[("\u{fffd}", "\u{fffd}"), ("\u{ffff}", " ")]);
    }

    #[test]
    fn test_all_encode() {
        let encoder = XmlEncoder::new(XmlEncoderMode::All);
        assert_encodes(
            &encoder,
            &[
                ("&", "&amp;"),
                (">", "&gt;"),
                ("<", "&lt;"),
                ("'", "&#39;"),
                ("\"", "&#34;"),
            ],
        );
        generic_tests(&encoder);
    }

    #[test]
    fn test_content_encode() {
        let encoder = XmlEncoder::new(XmlEncoderMode::Content);
        assert_encodes(
            &encoder,
            &[
                ("&", "&amp;"),
                (">", "&gt;"),
                ("<", "&lt;"),
                ("'", "'"),
                ("\"", "\""),
            ],
        );
        generic_tests(&encoder);
    }

    #[test]
    fn test_attribute_encode() {
        let encoder = XmlEncoder::new(XmlEncoderMode::Attribute);
        assert_encodes(
            &encoder,
            &[
                ("&", "&amp;"),
                (">", ">"),
                ("<", "&lt;"),
                ("'", "&#39;"),
                ("\"", "&#34;"),
            ],
        );
        generic_tests(&encoder);
    }

    #[test]
    fn test_single_quoted_encode() {
        let encoder = XmlEncoder::new(XmlEncoderMode::SingleQuotedAttribute);
        assert_encodes(
            &encoder,
            &[
                ("&", "&amp;"),
                (">", ">"),
                ("<", "&lt;"),
                ("'", "&#39;"),
                ("\"", "\""),
            ],
        );
        generic_tests(&encoder);
    }

    #[test]
    fn test_double_quoted_encode() {
        let encoder = XmlEncoder::new(XmlEncoderMode::DoubleQuotedAttribute);
        assert_encodes(
            &encoder,
            &[
                ("&", "&amp;"),
                (">", ">"),
                ("<", "&lt;"),
                ("\"", "&#34;"),
                ("'", "'"),
            ],
        );
        generic_tests(&encoder);
    }
}