Skip to main content

contextual_encoder/
uri.rs

//! URI component encoder.
//!
//! provides percent-encoding for URI components per RFC 3986.
//!
//! # security notes
//!
//! - this encoder is for **URI components** (query parameters, path segments,
//!   fragment identifiers), not entire URLs.
//! - it **cannot** make an untrusted full URL safe. percent-encoding does not
//!   validate anything: applied to a whole URL it also encodes the delimiters
//!   (`:`, `/`, `?`, `#`), so e.g. `javascript:alert(1)` becomes
//!   `javascript%3Aalert%281%29` — a different string, not a vetted URL.
//!   always validate the URL scheme and structure separately before embedding
//!   untrusted URLs.
//! - the output is safe for direct embedding in HTML, CSS, and javascript
//!   contexts because all context-significant characters are percent-encoded.
14
15use std::fmt;
16
17// ---------------------------------------------------------------------------
18// for_uri_component
19// ---------------------------------------------------------------------------
20
21/// percent-encodes `input` for safe use as a URI component.
22///
23/// only unreserved characters per RFC 3986 pass through unencoded:
24/// `A-Z`, `a-z`, `0-9`, `-`, `.`, `_`, `~`. everything else is encoded
25/// as percent-encoded UTF-8 bytes.
26///
27/// # examples
28///
29/// ```
30/// use contextual_encoder::for_uri_component;
31///
32/// assert_eq!(for_uri_component("hello world"), "hello%20world");
33/// assert_eq!(for_uri_component("a=1&b=2"), "a%3D1%26b%3D2");
34/// assert_eq!(for_uri_component("safe-text_v2.0"), "safe-text_v2.0");
35/// assert_eq!(for_uri_component("café"), "caf%C3%A9");
36/// ```
37pub fn for_uri_component(input: &str) -> String {
38    let mut out = String::with_capacity(input.len());
39    write_uri_component(&mut out, input).expect("writing to string cannot fail");
40    out
41}
42
43/// writes the percent-encoded form of `input` to `out`.
44///
45/// see [`for_uri_component`] for encoding rules.
46pub fn write_uri_component<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
47    for byte in input.as_bytes() {
48        if is_unreserved(*byte) {
49            out.write_char(*byte as char)?;
50        } else {
51            write!(out, "%{:02X}", byte)?;
52        }
53    }
54    Ok(())
55}
56
/// reports whether `b` is one of the RFC 3986 unreserved characters:
/// an ASCII letter, an ASCII digit, or one of `-`, `.`, `_`, `~`.
fn is_unreserved(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'-' | b'.' | b'_' | b'~')
}
61
#[cfg(test)]
mod tests {
    use super::*;

    /// writer that rejects every write; used to check error propagation.
    struct FailingWriter;

    impl std::fmt::Write for FailingWriter {
        fn write_str(&mut self, _s: &str) -> std::fmt::Result {
            Err(std::fmt::Error)
        }
    }

    #[test]
    fn uri_component_no_encoding_needed() {
        assert_eq!(for_uri_component("hello"), "hello");
        assert_eq!(for_uri_component(""), "");
        assert_eq!(for_uri_component("ABCxyz019"), "ABCxyz019");
        assert_eq!(for_uri_component("-._~"), "-._~");
    }

    #[test]
    fn uri_component_encodes_space() {
        assert_eq!(for_uri_component("a b"), "a%20b");
    }

    #[test]
    fn uri_component_encodes_reserved_chars() {
        assert_eq!(for_uri_component("a=b"), "a%3Db");
        assert_eq!(for_uri_component("a&b"), "a%26b");
        assert_eq!(for_uri_component("a+b"), "a%2Bb");
        assert_eq!(for_uri_component("a?b"), "a%3Fb");
        assert_eq!(for_uri_component("a#b"), "a%23b");
        assert_eq!(for_uri_component("a/b"), "a%2Fb");
    }

    #[test]
    fn uri_component_encodes_percent_sign() {
        // the escape character itself must be encoded, otherwise the output
        // could not be decoded unambiguously.
        assert_eq!(for_uri_component("%"), "%25");
        assert_eq!(for_uri_component("100%"), "100%25");
        // already-encoded input is encoded again, not passed through.
        assert_eq!(for_uri_component("%20"), "%2520");
    }

    #[test]
    fn uri_component_encodes_scheme_delimiter() {
        // `:` is not unreserved, so a scheme prefix does not survive encoding.
        assert_eq!(
            for_uri_component("javascript:alert(1)"),
            "javascript%3Aalert%281%29"
        );
    }

    #[test]
    fn uri_component_encodes_html_significant() {
        assert_eq!(for_uri_component("<script>"), "%3Cscript%3E");
        assert_eq!(for_uri_component(r#""quoted""#), "%22quoted%22");
    }

    #[test]
    fn uri_component_encodes_two_byte_utf8() {
        // U+00A0 (NBSP) → 0xC2 0xA0
        assert_eq!(for_uri_component("\u{00A0}"), "%C2%A0");
        // U+00E9 (é) → 0xC3 0xA9
        assert_eq!(for_uri_component("é"), "%C3%A9");
    }

    #[test]
    fn uri_component_encodes_three_byte_utf8() {
        // U+0800 → 0xE0 0xA0 0x80
        assert_eq!(for_uri_component("\u{0800}"), "%E0%A0%80");
        // U+4E16 (世) → 0xE4 0xB8 0x96
        assert_eq!(for_uri_component("世"), "%E4%B8%96");
    }

    #[test]
    fn uri_component_encodes_four_byte_utf8() {
        // U+10000 → 0xF0 0x90 0x80 0x80
        assert_eq!(for_uri_component("\u{10000}"), "%F0%90%80%80");
        // U+1F600 (😀) → 0xF0 0x9F 0x98 0x80
        assert_eq!(for_uri_component("😀"), "%F0%9F%98%80");
    }

    #[test]
    fn uri_component_encodes_control_chars() {
        assert_eq!(for_uri_component("\x00"), "%00");
        assert_eq!(for_uri_component("\x1F"), "%1F");
        assert_eq!(for_uri_component("\x7F"), "%7F");
    }

    #[test]
    fn uri_component_mixed() {
        assert_eq!(
            for_uri_component("key=hello world&foo=bar"),
            "key%3Dhello%20world%26foo%3Dbar"
        );
    }

    #[test]
    fn uri_component_writer_variant() {
        let mut out = String::new();
        write_uri_component(&mut out, "a b").unwrap();
        assert_eq!(out, "a%20b");
    }

    #[test]
    fn uri_component_writer_appends_to_existing_output() {
        // the writer variant must extend the buffer, not replace its contents.
        let mut out = String::from("q=");
        write_uri_component(&mut out, "a b").unwrap();
        assert_eq!(out, "q=a%20b");
    }

    #[test]
    fn uri_component_writer_propagates_errors() {
        // both the pass-through path and the percent-encoding path must
        // surface a failure reported by the underlying writer.
        assert!(write_uri_component(&mut FailingWriter, "a").is_err());
        assert!(write_uri_component(&mut FailingWriter, " ").is_err());
        // empty input performs no writes, so it succeeds even here.
        assert!(write_uri_component(&mut FailingWriter, "").is_ok());
    }
}