1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
//! Tiny `application/x-www-form-urlencoded` decoder.
//!
//! Three private copies of this lived in [`crate::signed_url`],
//! [`crate::auth_flows`], and [`crate::tenancy::admin`] before the
//! consolidation. URL decoders are a notorious source of security
//! bugs (overlong encodings, malformed `%xx` sequences, `+`/space
//! conflation, mixed-case hex) — keeping the implementation in one
//! place means a fix lands everywhere at once.
//!
//! ## Behavior
//!
//! * `+` → `' '` (the historical form-encoding convention. Generic
//! percent-decoders such as JavaScript's `decodeURIComponent` do
//! *not* implement it, but every browser form encoder emits `+` for
//! a space, so every server-side decoder we ship needs to honor it).
//! * `%XX` where both `X` are hex → that byte. Mixed case (`%Aa`)
//! accepted.
//! * `%XX` where either `X` is non-hex → the literal `%` is kept and
//! parsing continues at the next byte. Same convention as
//! `serde_urlencoded` + RFC 3986 §2.1: malformed escapes fall
//! through rather than aborting.
//! * Trailing `%` or `%X` (fewer than 2 bytes left after the `%`) → kept as literal.
//! * Decoded byte stream that is not valid UTF-8 → replaced with the
//! Unicode replacement character (`U+FFFD`) via
//! [`String::from_utf8_lossy`]. This is a deliberate choice over
//! `String::from_utf8(out).unwrap_or_default()` (the previous
//! `signed_url` / `auth_flows` shape) — the unwrap-or-default
//! variant *silently wipes the entire output* on a single bad
//! byte, which hid both legitimate non-UTF-8 inputs and crafted
//! ones. Lossy preserves the well-formed prefix and surfaces the
//! error to the caller as a visible replacement char.
//!
//! ## What this is *not*
//!
//! Not a full RFC 3986 percent-decoder. Specifically, it doesn't
//! distinguish reserved characters by URI component (path vs query
//! vs fragment) — every `%XX` decodes regardless of position. Use
//! `url::Url` for parsing whole URLs; use this for body fields and
//! query-string values where the whole input is already known to be
//! `application/x-www-form-urlencoded`.
/// Percent-encode every byte outside the RFC 3986 *unreserved* set
/// (ASCII alphanumerics plus `-` `_` `.` `~`).
///
/// Used by URL-building helpers that need to safely round-trip user
/// input through a query string. The input is processed byte by byte,
/// so a multibyte UTF-8 character becomes one `%XX` triplet per byte.
/// Hex digits are always uppercase.
///
/// A space is emitted as `%20`, never as `+`: the `+` ↔ space mapping
/// is a form-*decoder* convention, and `%20` is accepted everywhere.
///
/// Mirror of the inline implementations in `template_views`'s
/// pagination URL builder. Centralizing keeps the encoder
/// table consistent across modules.
#[must_use]
pub fn url_encode(s: &str) -> String {
    // Uppercase hex digits, indexed by nibble value.
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    // `s.len()` is a lower bound: every escaped byte grows to three.
    let mut out = String::with_capacity(s.len());
    for b in s.bytes() {
        if b.is_ascii_alphanumeric() || matches!(b, b'-' | b'_' | b'.' | b'~') {
            out.push(char::from(b));
        } else {
            // Push the three characters directly instead of going
            // through `format!`, which would allocate a temporary
            // `String` for every escaped byte.
            out.push('%');
            out.push(char::from(HEX_UPPER[usize::from(b >> 4)]));
            out.push(char::from(HEX_UPPER[usize::from(b & 0x0F)]));
        }
    }
    out
}
/// Decode an `application/x-www-form-urlencoded` string.
///
/// * `+` becomes a space.
/// * `%XX` becomes the byte `0xXX` only when *both* `X` are ASCII hex
///   digits (upper or lower case).
/// * Any other `%` — a malformed escape, or one with fewer than two
///   bytes after it — is kept as a literal `%` and decoding resumes at
///   the byte that follows it.
/// * The decoded bytes are converted with [`String::from_utf8_lossy`],
///   so invalid UTF-8 shows up as `U+FFFD` rather than discarding the
///   output.
///
/// See module docs for the rationale behind the malformed-input handling.
#[must_use]
pub(crate) fn url_decode(s: &str) -> String {
    // Numeric value of a single ASCII hex digit; `None` for any other byte.
    fn hex_val(b: u8) -> Option<u8> {
        match b {
            b'0'..=b'9' => Some(b - b'0'),
            b'a'..=b'f' => Some(b - b'a' + 10),
            b'A'..=b'F' => Some(b - b'A' + 10),
            _ => None,
        }
    }

    let bytes = s.as_bytes();
    let mut out = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        match bytes[i] {
            b'%' => {
                // Validate each digit on its own. Parsing the pair with
                // `u8::from_str_radix` is NOT equivalent: it accepts a
                // leading `+` sign, so `%+A` would decode to a newline
                // instead of being treated as a malformed escape.
                let escaped = match (bytes.get(i + 1), bytes.get(i + 2)) {
                    (Some(&hi), Some(&lo)) => hex_val(hi).zip(hex_val(lo)),
                    _ => None,
                };
                if let Some((hi, lo)) = escaped {
                    out.push((hi << 4) | lo);
                    i += 3;
                } else {
                    // Malformed or truncated escape: keep the `%` and
                    // let the following bytes be decoded normally.
                    out.push(b'%');
                    i += 1;
                }
            }
            b'+' => {
                out.push(b' ');
                i += 1;
            }
            other => {
                out.push(other);
                i += 1;
            }
        }
    }
    String::from_utf8_lossy(&out).into_owned()
}
#[cfg(test)]
mod tests {
    //! Unit tests for [`url_decode`] and [`url_encode`].

    use super::*;

    // ---- url_decode ----

    #[test]
    fn plain_text_passes_through() {
        assert_eq!(url_decode("hello"), "hello");
    }

    #[test]
    fn empty_string_yields_empty() {
        assert_eq!(url_decode(""), "");
    }

    #[test]
    fn percent_20_becomes_space() {
        assert_eq!(url_decode("hello%20world"), "hello world");
    }

    #[test]
    fn plus_becomes_space() {
        assert_eq!(url_decode("hello+world"), "hello world");
    }

    #[test]
    fn percent_2b_decodes_to_literal_plus() {
        // `%2B` is the encoded form of `+`; round-trip must NOT be
        // confused with the `+ → space` convention.
        assert_eq!(url_decode("a%2Bb"), "a+b");
    }

    #[test]
    fn mixed_plus_and_percent() {
        assert_eq!(url_decode("hello+world%21"), "hello world!");
    }

    #[test]
    fn mixed_case_hex_accepted() {
        assert_eq!(url_decode("%2A%2a%2F%2f"), "**//");
    }

    #[test]
    fn unicode_via_utf8_bytes() {
        // `café` = 0x63 0x61 0x66 0xC3 0xA9
        assert_eq!(url_decode("caf%C3%A9"), "café");
    }

    #[test]
    fn malformed_percent_kept_as_literal() {
        // `%2X` is not a valid escape — literal `%` survives, then
        // parsing continues at the `2`.
        assert_eq!(url_decode("a%2Xb"), "a%2Xb");
    }

    #[test]
    fn malformed_non_hex_first_digit() {
        assert_eq!(url_decode("a%XYb"), "a%XYb");
    }

    #[test]
    fn trailing_percent_kept_as_literal() {
        // Nothing after `%` — the escape can't even start.
        assert_eq!(url_decode("foo%"), "foo%");
        // Only one byte after `%` — the escape can't complete, so the
        // `%` stays literal and `2` is copied as an ordinary character.
        assert_eq!(url_decode("foo%2"), "foo%2");
    }

    #[test]
    fn invalid_utf8_is_replaced_not_dropped() {
        // A trailing `%C3` is a complete escape and decodes to the
        // byte 0xC3, which on its own is an incomplete UTF-8 sequence
        // (lead byte of a 2-byte char with no continuation). The OLD
        // impl (`from_utf8(out).unwrap_or_default()`) would return ""
        // — a silent total wipe of the input. Lossy conversion keeps
        // the well-formed prefix and emits one U+FFFD for the bad byte.
        assert_eq!(url_decode("hello%C3"), "hello\u{FFFD}");
    }

    #[test]
    fn invalid_utf8_in_middle_keeps_well_formed_tail() {
        // A real malformed sequence in the middle: `%C3%28` — `%C3`
        // is a valid UTF-8 lead byte but `%28` (=`(`) is NOT a valid
        // continuation byte. The lossy decoder must keep the prefix,
        // emit U+FFFD for the bad byte, and KEEP DECODING the tail
        // (including the `(` that failed to act as a continuation).
        assert_eq!(url_decode("a%C3%28b"), "a\u{FFFD}(b");
    }

    #[test]
    fn no_panic_on_arbitrary_input() {
        // Smoke: feed a few weird strings and confirm no panic +
        // some output. Every input here is non-empty and nothing in
        // it can decode to zero bytes, so the output must be non-empty.
        for s in ["%", "%%", "%%%", "+%", "%+", "+%2", "%2+"] {
            let got = url_decode(s);
            assert!(!got.is_empty(), "empty output for {s:?}");
        }
    }

    #[test]
    fn dollar_amp_equal_unchanged() {
        // Reserved characters that aren't `%` or `+` pass through
        // without alteration. Caller is expected to have already
        // split on `&` / `=` etc.
        assert_eq!(url_decode("a=b&c=d"), "a=b&c=d");
    }

    // ---- url_encode ----

    #[test]
    fn url_encode_unreserved_pass_through() {
        assert_eq!(url_encode("plain"), "plain");
        assert_eq!(url_encode("foo-bar.baz_~"), "foo-bar.baz_~");
        assert_eq!(url_encode("AaZz09"), "AaZz09");
    }

    #[test]
    fn url_encode_reserved_chars_percent_encoded() {
        assert_eq!(url_encode("hello world"), "hello%20world");
        assert_eq!(url_encode("a&b=c"), "a%26b%3Dc");
        assert_eq!(url_encode("?#"), "%3F%23");
    }

    /// Round-trip: encode then decode reproduces the input. Confirms
    /// the encoder and decoder agree on the unreserved set.
    #[test]
    fn url_encode_decode_round_trip() {
        for input in [
            "plain",
            "hello world",
            "a&b=c",
            "café",    // multibyte UTF-8
            "100%off", // user input with `%`
            "x_y-z.0", // mostly-unreserved
            "?#&=+/!", // pile of reserved
        ] {
            let encoded = url_encode(input);
            let decoded = url_decode(&encoded);
            assert_eq!(decoded, input, "round-trip failed on `{input}`");
        }
    }
}