1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
//! html: Go's html package — EscapeString / UnescapeString.
//!
//! | Go                       | goish                     |
//! |--------------------------|---------------------------|
//! | `html.EscapeString(s)`   | `html::EscapeString(s)`   |
//! | `html.UnescapeString(s)` | `html::UnescapeString(s)` |
//!
//! Scope: the five "core" HTML entities (`&amp;` `&lt;` `&gt;` `&quot;` `&apos;`)
//! plus numeric references (`&#NN;` / `&#xHH;`). The full HTML5 named-entity table
//! (~2000 entries) is not ported — it belongs to `html/template` which
//! is deferred to the long-tail milestone.
/// `html.EscapeString(s)` — escapes the five HTML-significant characters.
///
/// Each of `&`, `<`, `>`, `"` and `'` is replaced by the same text Go's
/// `html.EscapeString` produces: `&amp;`, `&lt;`, `&gt;`, `&#34;` and
/// `&#39;`. Every other character (including non-ASCII text) is copied
/// through unchanged.
///
/// Returns a newly allocated `String`; the input is only borrowed.
#[allow(non_snake_case)] // keeps the Go-style exported name
pub fn EscapeString(s: impl AsRef<str>) -> String {
    let s = s.as_ref();
    // The output is at least as long as the input; escapes only make it grow.
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            // Numeric forms for the quotes, matching Go's output.
            '"' => out.push_str("&#34;"),
            '\'' => out.push_str("&#39;"),
            _ => out.push(c),
        }
    }
    out
}
/// `html.UnescapeString(s)` — reverses `EscapeString`.
///
/// Recognises the named entities `&amp;`, `&lt;`, `&gt;`, `&quot;` and
/// `&apos;`, plus decimal (`&#NN;`) and hexadecimal (`&#xHH;` / `&#XHH;`)
/// numeric references. Anything else — an unknown name, a reference with
/// no terminating `;` nearby, or a numeric value that is not a valid
/// `char` — is left in the output exactly as written.
///
/// Text between entities is copied as whole `&str` slices, so multi-byte
/// UTF-8 sequences are preserved.
#[allow(non_snake_case)] // keeps the Go-style exported name
pub fn UnescapeString(s: impl AsRef<str>) -> String {
    // The terminating ';' must appear within this many bytes, counted from
    // (and including) the '&'. Long enough for `&#x10FFFF;`.
    const MAX_ENTITY_LEN: usize = 10;

    let s = s.as_ref();
    let mut out = String::with_capacity(s.len());
    let mut rest = s;

    while let Some(amp) = rest.find('&') {
        // Literal run before the '&'. Slicing at an ASCII byte is always a
        // valid char boundary.
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];

        // ';' is ASCII, so a byte-level search can never land inside a
        // multi-byte character.
        let decoded = tail
            .as_bytes()
            .iter()
            .take(MAX_ENTITY_LEN)
            .position(|&b| b == b';')
            .and_then(|semi| decode_entity(&tail[..=semi]).map(|c| (c, semi + 1)));

        match decoded {
            Some((c, consumed)) => {
                out.push(c);
                rest = &tail[consumed..];
            }
            None => {
                // Not an entity: emit the '&' literally and rescan from the
                // next byte, so a later '&' in the same window still gets
                // its chance.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }

    out.push_str(rest);
    out
}

/// Decodes one complete entity of the form `&name;`, `&#NN;` or `&#xHH;`.
///
/// Returns `None` when `ent` is not wrapped in `&` … `;`, when the name is
/// not one of the five supported ones, or when the numeric part is empty,
/// contains anything other than digits of the chosen radix, overflows
/// `u32`, or does not name a valid `char`.
fn decode_entity(ent: &str) -> Option<char> {
    let inner = ent.strip_prefix('&')?.strip_suffix(';')?;
    match inner {
        "amp" => Some('&'),
        "lt" => Some('<'),
        "gt" => Some('>'),
        "quot" => Some('"'),
        "apos" => Some('\''),
        _ => {
            let digits = inner.strip_prefix('#')?;
            let (val, radix) = match digits.strip_prefix(|c| c == 'x' || c == 'X') {
                Some(hex) => (hex, 16),
                None => (digits, 10),
            };
            // `from_str_radix` tolerates a leading '+', which is not valid
            // in a character reference, so validate the digits explicitly.
            if val.is_empty() || !val.chars().all(|c| c.is_digit(radix)) {
                return None;
            }
            u32::from_str_radix(val, radix).ok().and_then(char::from_u32)
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Markup-significant characters must come out as entity text.
    #[test]
    fn escape_specials() {
        assert_eq!(EscapeString("a<b & c>d"), "a&lt;b &amp; c&gt;d");
        // Both quote characters must be escaped and must decode back;
        // the exact spelling of the quote entities is not pinned here.
        let quoted = EscapeString("\"'");
        assert!(!quoted.contains('"'));
        assert!(!quoted.contains('\''));
        assert_eq!(UnescapeString(&quoted), "\"'");
    }

    /// The named entities in scope decode to their characters.
    #[test]
    fn unescape_named() {
        assert_eq!(UnescapeString("&lt;hi&gt;"), "<hi>");
        assert_eq!(UnescapeString("&amp;"), "&");
        assert_eq!(UnescapeString("&quot;"), "\"");
        assert_eq!(UnescapeString("&apos;"), "'");
    }

    /// Decimal and hexadecimal numeric references decode to the code point.
    #[test]
    fn unescape_numeric() {
        assert_eq!(UnescapeString("&#65;"), "A");
        assert_eq!(UnescapeString("&#x41;"), "A");
        assert_eq!(UnescapeString("&#39;"), "'");
        assert_eq!(UnescapeString("&#x1F600;"), "😀");
    }

    /// Unknown names and bare ampersands are copied through verbatim.
    #[test]
    fn unescape_unknown_passthrough() {
        assert_eq!(UnescapeString("&unknown;"), "&unknown;");
        assert_eq!(UnescapeString("a & b"), "a & b");
    }

    /// Escaping then unescaping returns the original text.
    #[test]
    fn round_trip_core() {
        let s = "<script>alert(\"x & y\")</script>";
        assert_eq!(UnescapeString(EscapeString(s)), s);
    }
}