// contextual_encoder/uri.rs

//! URI component encoder.
//!
//! provides percent-encoding for URI components per RFC 3986.
//!
//! # security notes
//!
//! - this encoder is for **URI components** (query parameters, path segments,
//!   fragment identifiers), not entire URLs.
//! - it **cannot** make an untrusted full URL safe. a `javascript:` URL will
//!   be percent-encoded but still execute. always validate the URL scheme and
//!   structure separately before embedding untrusted URLs.
//! - the output is safe for direct embedding in HTML, CSS, and javascript
//!   contexts because all context-significant characters are percent-encoded.

use std::fmt;

// ---------------------------------------------------------------------------
// for_uri_component
// ---------------------------------------------------------------------------

21/// percent-encodes `input` for safe use as a URI component.
22///
23/// only unreserved characters per RFC 3986 pass through unencoded:
24/// `A-Z`, `a-z`, `0-9`, `-`, `.`, `_`, `~`. everything else is encoded
25/// as percent-encoded UTF-8 bytes.
26///
27/// # examples
28///
29/// ```
30/// use contextual_encoder::for_uri_component;
31///
32/// assert_eq!(for_uri_component("hello world"), "hello%20world");
33/// assert_eq!(for_uri_component("a=1&b=2"), "a%3D1%26b%3D2");
34/// assert_eq!(for_uri_component("safe-text_v2.0"), "safe-text_v2.0");
35/// assert_eq!(for_uri_component("café"), "caf%C3%A9");
36/// ```
37pub fn for_uri_component(input: &str) -> String {
38    let bytes = input.as_bytes();
39    let unreserved = bytes.iter().filter(|b| is_unreserved(**b)).count();
40    let capacity = unreserved + 3 * (bytes.len() - unreserved);
41    let mut out = String::with_capacity(capacity);
42    write_uri_component(&mut out, input).expect("writing to string cannot fail");
43    out
44}
45
46/// writes the percent-encoded form of `input` to `out`.
47///
48/// see [`for_uri_component`] for encoding rules.
49pub fn write_uri_component<W: fmt::Write>(out: &mut W, input: &str) -> fmt::Result {
50    let bytes = input.as_bytes();
51    let mut last_written = 0;
52
53    for (i, &byte) in bytes.iter().enumerate() {
54        if !is_unreserved(byte) {
55            // flush the preceding run of unreserved (ASCII) bytes
56            if last_written < i {
57                // safe: unreserved chars are all ASCII, so this slice is valid UTF-8
58                out.write_str(&input[last_written..i])?;
59            }
60            write!(out, "%{:02X}", byte)?;
61            last_written = i + 1;
62        }
63    }
64
65    // flush any trailing safe run
66    if last_written < bytes.len() {
67        out.write_str(&input[last_written..])?;
68    }
69    Ok(())
70}
71
/// returns true if the byte represents an unreserved character per RFC 3986.
///
/// the unreserved set is the ASCII letters, the ASCII digits, and the four
/// punctuation characters `-`, `.`, `_`, `~`. every non-ASCII byte is
/// therefore reported as reserved.
fn is_unreserved(b: u8) -> bool {
    b.is_ascii_alphanumeric() || matches!(b, b'-' | b'.' | b'_' | b'~')
}

// unit tests for the URI component encoder: pass-through of unreserved
// characters, escaping of reserved / HTML-significant / control characters,
// UTF-8 sequences of every length, and the writer-based entry point.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn uri_component_no_encoding_needed() {
        assert_eq!(for_uri_component("hello"), "hello");
        assert_eq!(for_uri_component(""), "");
        assert_eq!(for_uri_component("ABCxyz019"), "ABCxyz019");
        assert_eq!(for_uri_component("-._~"), "-._~");
    }

    #[test]
    fn uri_component_encodes_space() {
        assert_eq!(for_uri_component("a b"), "a%20b");
    }

    #[test]
    fn uri_component_encodes_reserved_chars() {
        assert_eq!(for_uri_component("a=b"), "a%3Db");
        assert_eq!(for_uri_component("a&b"), "a%26b");
        assert_eq!(for_uri_component("a+b"), "a%2Bb");
        assert_eq!(for_uri_component("a?b"), "a%3Fb");
        assert_eq!(for_uri_component("a#b"), "a%23b");
        assert_eq!(for_uri_component("a/b"), "a%2Fb");
    }

    #[test]
    fn uri_component_encodes_percent_sign() {
        // `%` is not unreserved, so already-escaped input is escaped again
        assert_eq!(for_uri_component("100%"), "100%25");
        assert_eq!(for_uri_component("%20"), "%2520");
    }

    #[test]
    fn uri_component_encodes_html_significant() {
        assert_eq!(for_uri_component("<script>"), "%3Cscript%3E");
        assert_eq!(for_uri_component(r#""quoted""#), "%22quoted%22");
    }

    #[test]
    fn uri_component_encodes_two_byte_utf8() {
        // U+00A0 (NBSP) → 0xC2 0xA0
        assert_eq!(for_uri_component("\u{00A0}"), "%C2%A0");
        // U+00E9 (é) → 0xC3 0xA9
        assert_eq!(for_uri_component("é"), "%C3%A9");
    }

    #[test]
    fn uri_component_encodes_three_byte_utf8() {
        // U+0800 → 0xE0 0xA0 0x80
        assert_eq!(for_uri_component("\u{0800}"), "%E0%A0%80");
        // U+4E16 (世) → 0xE4 0xB8 0x96
        assert_eq!(for_uri_component("世"), "%E4%B8%96");
    }

    #[test]
    fn uri_component_encodes_four_byte_utf8() {
        // U+10000 → 0xF0 0x90 0x80 0x80
        assert_eq!(for_uri_component("\u{10000}"), "%F0%90%80%80");
        // U+1F600 (😀) → 0xF0 0x9F 0x98 0x80
        assert_eq!(for_uri_component("😀"), "%F0%9F%98%80");
    }

    #[test]
    fn uri_component_encodes_control_chars() {
        assert_eq!(for_uri_component("\x00"), "%00");
        assert_eq!(for_uri_component("\x1F"), "%1F");
        assert_eq!(for_uri_component("\x7F"), "%7F");
    }

    #[test]
    fn uri_component_mixed() {
        assert_eq!(
            for_uri_component("key=hello world&foo=bar"),
            "key%3Dhello%20world%26foo%3Dbar"
        );
    }

    #[test]
    fn uri_component_mixed_ascii_and_multibyte() {
        // unreserved runs on both sides of multi-byte characters survive intact
        assert_eq!(for_uri_component("xéy世z"), "x%C3%A9y%E4%B8%96z");
    }

    #[test]
    fn uri_component_writer_variant() {
        let mut out = String::new();
        write_uri_component(&mut out, "a b").unwrap();
        assert_eq!(out, "a%20b");
    }

    #[test]
    fn uri_component_writer_appends_to_existing_output() {
        let mut out = String::from("q=");
        write_uri_component(&mut out, "a/b").unwrap();
        assert_eq!(out, "q=a%2Fb");
    }
}