Skip to main content

pocopine_codec/
lib.rs

1//! Shared encoding / codec utilities for the pocopine workspace.
2//!
3//! The point of this crate is that crates never re-implement encoding helpers or
4//! their serde adapters and never reach for `base64` or `percent-encoding`
5//! directly. Add new codecs here rather than inlining them per crate.
6//!
7//! ```
8//! use pocopine_codec::{base64_decode, base64_encode};
9//!
10//! assert_eq!(base64_encode(b"hi"), "aGk=");
11//! assert_eq!(base64_decode("aGk=").unwrap(), b"hi");
12//! ```
13//!
14//! Percent-encoding goes through one component encoder (RFC 3986
15//! "unreserved" set — what URL path segments, query parts, and fragments
16//! want) plus a lossy decoder. Pass a custom [`AsciiSet`] to
17//! [`percent_encode_set`] when a backend needs a different escape set.
18//!
19//! ```
20//! use pocopine_codec::{percent_decode, percent_encode};
21//!
22//! assert_eq!(percent_encode("a b/c~d"), "a%20b%2Fc~d");
23//! assert_eq!(percent_decode("a%20b+c", true), "a b c");
24//! ```
25//!
26//! For a `Vec<u8>` struct field that should serialize as a base64 string, use the
27//! [`base64_bytes`] serde adapter:
28//!
29//! ```
30//! use serde::{Deserialize, Serialize};
31//!
32//! #[derive(Serialize, Deserialize)]
33//! struct Chunk {
34//!     #[serde(with = "pocopine_codec::base64_bytes", default, skip_serializing_if = "Vec::is_empty")]
35//!     payload: Vec<u8>,
36//! }
37//! ```
38//!
39//! This crate is `no_std` (it only needs `alloc`).
40
41#![cfg_attr(not(test), no_std)]
42
43extern crate alloc;
44
45use alloc::string::{String, ToString};
46use alloc::vec::Vec;
47
48pub use base64;
49pub use percent_encoding;
50// Re-exported so callers can express a custom percent-encode set
51// (e.g. a backend-specific path-segment set) without depending on
52// `percent-encoding` directly. Pair with [`percent_encode_set`].
53pub use percent_encoding::{AsciiSet, CONTROLS, NON_ALPHANUMERIC};
54
55use base64::engine::general_purpose::STANDARD;
56use base64::Engine as _;
57use percent_encoding::{percent_decode_str, utf8_percent_encode};
58
59/// Encode bytes as a standard (padded) base64 string.
60pub fn base64_encode(bytes: &[u8]) -> String {
61    STANDARD.encode(bytes)
62}
63
64/// Decode a standard (padded) base64 string.
65pub fn base64_decode(encoded: &str) -> Result<Vec<u8>, base64::DecodeError> {
66    STANDARD.decode(encoded)
67}
68
69/// RFC 3986 "component" encode set: everything **except** the unreserved
70/// set `A-Z` / `a-z` / `0-9` / `-` / `.` / `_` / `~` is percent-encoded
71/// (uppercase hex). Non-ASCII bytes are always encoded. This is the
72/// conservative encoder URL path segments, query keys/values, and
73/// fragments want — equivalent to JS `encodeURIComponent` minus its
74/// `!*'()` carve-outs.
75const COMPONENT: &AsciiSet = &NON_ALPHANUMERIC
76    .remove(b'-')
77    .remove(b'.')
78    .remove(b'_')
79    .remove(b'~');
80
81/// Percent-encode `s` with the RFC 3986 component set (see [`COMPONENT`]).
82///
83/// Use this for a single URL path segment, query key/value, or fragment.
84pub fn percent_encode(s: &str) -> String {
85    utf8_percent_encode(s, COMPONENT).to_string()
86}
87
88/// Percent-encode `s` with the component set, appending into `out`.
89///
90/// Allocation-free variant of [`percent_encode`] for callers building a
91/// URL piece by piece.
92pub fn percent_encode_into(out: &mut String, s: &str) {
93    out.extend(utf8_percent_encode(s, COMPONENT));
94}
95
96/// Percent-encode `s` with a caller-supplied [`AsciiSet`].
97///
98/// For the common case prefer [`percent_encode`]. Reach for this only when
99/// a backend needs a different escape set — build one from the re-exported
100/// [`CONTROLS`] / [`NON_ALPHANUMERIC`] + [`AsciiSet::add`]/[`AsciiSet::remove`].
101pub fn percent_encode_set(s: &str, set: &'static AsciiSet) -> String {
102    utf8_percent_encode(s, set).to_string()
103}
104
105/// Percent-decode `s`, lossily (invalid UTF-8 becomes U+FFFD, and a `%`
106/// not followed by two hex digits is passed through verbatim).
107///
108/// When `plus_as_space` is set, `+` is first replaced with a space — the
109/// `application/x-www-form-urlencoded` query semantics. Leave it `false`
110/// for path segments, where `+` is a literal plus.
111pub fn percent_decode(s: &str, plus_as_space: bool) -> String {
112    if plus_as_space && s.contains('+') {
113        let replaced = s.replace('+', " ");
114        percent_decode_str(&replaced)
115            .decode_utf8_lossy()
116            .into_owned()
117    } else {
118        percent_decode_str(s).decode_utf8_lossy().into_owned()
119    }
120}
121
122/// Serde adapter for a `Vec<u8>` field encoded as a base64 string.
123///
124/// Use via `#[serde(with = "pocopine_codec::base64_bytes")]`. Pair with
125/// `#[serde(default, skip_serializing_if = "Vec::is_empty")]` to omit empty values.
126pub mod base64_bytes {
127    use alloc::string::String;
128    use alloc::vec::Vec;
129
130    use serde::{Deserialize, Deserializer, Serializer};
131
132    pub fn serialize<S: Serializer>(bytes: &[u8], serializer: S) -> Result<S::Ok, S::Error> {
133        serializer.serialize_str(&super::base64_encode(bytes))
134    }
135
136    pub fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result<Vec<u8>, D::Error> {
137        let encoded = String::deserialize(deserializer)?;
138        super::base64_decode(&encoded).map_err(serde::de::Error::custom)
139    }
140}
141
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture wiring the `base64_bytes` adapter onto a field, together with
    /// the `default` / `skip_serializing_if` attributes it is documented with.
    #[derive(serde::Serialize, serde::Deserialize, PartialEq, Debug)]
    struct Chunk {
        #[serde(with = "base64_bytes", default, skip_serializing_if = "Vec::is_empty")]
        payload: Vec<u8>,
    }

    #[test]
    fn base64_round_trips() {
        let samples: [&[u8]; 4] = [b"", b"a", b"hello world", &[0u8, 255, 128, 1]];
        for sample in samples {
            let decoded = base64_decode(&base64_encode(sample)).unwrap();
            assert_eq!(decoded, sample);
        }
    }

    #[test]
    fn known_base64_vectors() {
        let (plain, encoded) = (b"hi", "aGk=");
        assert_eq!(base64_encode(plain), encoded);
        assert_eq!(base64_decode(encoded).unwrap(), plain);
    }

    #[test]
    fn invalid_base64_is_rejected() {
        let outcome = base64_decode("not valid base64!!!");
        assert!(outcome.is_err());
    }

    #[test]
    fn percent_encode_keeps_unreserved_and_escapes_the_rest() {
        let cases = [
            // Unreserved set passes through untouched.
            ("AZaz09-._~", "AZaz09-._~"),
            // Reserved / delimiter bytes become uppercase %XX.
            ("a b/c?d#e", "a%20b%2Fc%3Fd%23e"),
            // Non-ASCII is encoded byte-by-byte (UTF-8).
            ("é", "%C3%A9"),
        ];
        for (raw, expected) in cases {
            assert_eq!(percent_encode(raw), expected);
        }
    }

    #[test]
    fn percent_encode_into_appends() {
        let mut buffer = String::from("p/");
        percent_encode_into(&mut buffer, "a b");
        assert_eq!(buffer, "p/a%20b");
    }

    #[test]
    fn percent_encode_set_honors_custom_set() {
        // A set that only escapes spaces leaves `/` and `~` alone.
        const SPACE_ONLY: &AsciiSet = &CONTROLS.add(b' ');
        let spaces_only = percent_encode_set("a b/c~d", SPACE_ONLY);
        assert_eq!(spaces_only, "a%20b/c~d");

        // NON_ALPHANUMERIC escapes the unreserved punctuation too.
        let everything = percent_encode_set("a-b.c", NON_ALPHANUMERIC);
        assert_eq!(everything, "a%2Db%2Ec");
    }

    #[test]
    fn percent_decode_round_trips_component() {
        let samples = ["", "plain", "a b/c?d#e", "é", "100%done"];
        for original in samples {
            let encoded = percent_encode(original);
            assert_eq!(percent_decode(&encoded, false), original);
        }
    }

    #[test]
    fn percent_decode_plus_semantics() {
        let input = "a+b%20c";
        // Query semantics: `+` is a space.
        assert_eq!(percent_decode(input, true), "a b c");
        // Path semantics: `+` is a literal plus.
        assert_eq!(percent_decode(input, false), "a+b c");
    }

    #[test]
    fn percent_decode_is_lossy_and_passes_through_stray_percent() {
        // Invalid UTF-8 becomes the replacement character.
        let invalid_utf8 = percent_decode("%FF", false);
        assert_eq!(invalid_utf8, "\u{FFFD}");

        // A `%` not followed by two hex digits is left verbatim.
        let stray_percent = percent_decode("100%zz", false);
        assert_eq!(stray_percent, "100%zz");
    }

    #[test]
    fn serde_adapter_round_trips_and_omits_empty() {
        // Non-empty payload: serialized as base64 text and read back intact.
        let filled = Chunk {
            payload: [1u8, 2, 3, 4].to_vec(),
        };
        let filled_json = serde_json::to_string(&filled).unwrap();
        assert!(filled_json.contains("AQIDBA=="));
        let reparsed: Chunk = serde_json::from_str(&filled_json).unwrap();
        assert_eq!(reparsed, filled);

        // Empty payload: the field is skipped on output and defaulted on input.
        let blank = Chunk {
            payload: Vec::new(),
        };
        let blank_json = serde_json::to_string(&blank).unwrap();
        assert_eq!(blank_json, "{}");
        let reparsed: Chunk = serde_json::from_str("{}").unwrap();
        assert_eq!(reparsed, blank);
    }
}
236}