async_coap_uri/escape/mod.rs
1// Copyright 2019 Google LLC
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14//
15
16//! # URI percent encoding/decoding ("URI Escaping")
17//!
//! This module was written before the author was aware of the [`percent-encoding`] crate.
//! Nonetheless, it has some convenient features that are not present in that crate,
//! so it remains in this crate for now.
//!
//! [`percent-encoding`]: https://docs.rs/percent-encoding/2.0.0/percent_encoding/index.html
23//!
24//! It focuses solely on percent encoding/decoding of UTF8-encoded strings, treating all
25//! percent-encoded strings that would otherwise decode to invalid UTF8 as themselves invalid.
26//!
27//! The primary interface to encoding and decoding is a provided trait that extends `str`:
28//! [`StrExt`].
29//!
30//! Percent encoding is performed by [`escape_uri()`], which returns an iterator that
31//! escapes the string. Likewise, percent decoding is performed by [`unescape_uri()`].
32//!
33//! As a special case, the trait also provides [`unescape_uri_in_place()`], which performs
34//! in-place percent-decoding for a mutable string slice.
35//!
36//! # Usage Patterns
37//!
38//! The iterators returned by [`escape_uri()`] and [`unescape_uri()`] both implement
39//! [`core::fmt::Display`], and thus also implement [`std::string::ToString`]:
40//!
41//! ```
42//! use async_coap_uri::prelude::*;
43//! let escaped_string = "This needs escaping".escape_uri().to_string();
44//!
45//! assert_eq!(&escaped_string, "This%20needs%20escaping");
46//! ```
47//!
//! Both iterators can also be converted into a `Cow<str>` via `From`:
49//!
50//! ```
51//! # use async_coap_uri::prelude::*;
52//! use std::borrow::Cow;
53//! use std::convert::From;
54//! let escaped_cow_str = Cow::from("This needs escaping?+3".escape_uri());
55//!
56//! assert_eq!(&escaped_cow_str, "This%20needs%20escaping%3F+3");
57//! ```
58//!
59//! # Changing Behavior
60//!
//! There is no one-size-fits-all escaping strategy for URIs: Some parts need to be escaped
62//! differently than others. For example, *path segments* must have the `?` character escaped to
63//! `%3F`, but this character is perfectly acceptable in the *query component*. Also, query
64//! components have historically escaped the space character (` `) to the plus (`+`)
65//! character, so pluses need to be escaped to `%2B`.
66//!
67//! By default, [`StrExt::escape_uri`] produces an iterator suitable for encoding *path segments*,
68//! but other cases are handled by calling a modifier method on the [`EscapeUri`] iterator:
69//!
70//! ```
71//! # use async_coap_uri::prelude::*;
72//! let escaped_string = "This needs escaping?+3".escape_uri().for_query().to_string();
73//!
74//! assert_eq!(&escaped_string, "This+needs+escaping?%2B3");
75//! ```
76//!
77//! The [`EscapeUri`] iterator also provides the modifier methods `for_fragment()` and `full()`
78//! for encoding URI fragments and performing full percent encoding, respectively.
79//!
80//! ## Skipping Slashes
81//!
82//! The [`UnescapeUri`] iterator provides a modifier method for assisting in decoding the
83//! entire URI *path component* (as opposed to individual *path segments*) where encoded
84//! slashes (`%2F`) are not decoded, preserving the hierarchy:
85//!
86//! ```
87//! # use async_coap_uri::prelude::*;
88//! let escaped_string = "/this/p%20a%20t%20h/has%2Fextra/segments";
89//!
90//! let unescaped = escaped_string.unescape_uri().to_string();
91//! assert_eq!(&unescaped, "/this/p a t h/has/extra/segments");
92//!
93//! let unescaped = escaped_string.unescape_uri().skip_slashes().to_string();
94//! assert_eq!(&unescaped, "/this/p a t h/has%2Fextra/segments");
95//! ```
96//!
97//! # Handling Encoding Errors
98//!
99//! While [`escape_uri()`] cannot fail, an escaped string can contain errors.
100//! In situations where escaped characters cannot be properly decoded, the
101//! [`unescape_uri()`] iterator will by default insert replacement characters
102//! where errors are detected:
103//!
104//! * Illegal escaped UTF8 errors are replaced with [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD] (`�`).
105//! * Escaped ASCII control codes are decoded as [Unicode Control Pictures] like `␀` and `␊`.
106//! * Unescaped ASCII control codes are dropped entirely.
107//!
108//! In cases where this is not appropriate, the iterator for [`unescape_uri()`] ([`UnescapeUri`])
109//! provides the following methods:
110//!
111//! * [`first_error()`]\: Returns the location of the first detected encoding error, or `None` if
112//! there are no encoding errors.
113//! * [`try_to_string()`]\: Returns an unescaped [`String`] only if no encoding errors were present.
114//! * [`try_to_cow()`]\: Returns an unescaped [`Cow<str>`] only if no encoding errors were present.
115//!
116//! [U+FFFD]: core::char::REPLACEMENT_CHARACTER
117//! [Unicode Control Pictures]: https://www.unicode.org/charts/PDF/U2400.pdf
118//! [`escape_uri()`]: #method.escape_uri
119//! [`unescape_uri()`]: #method.unescape_uri
120//! [`unescape_uri_in_place()`]: #method.unescape_uri_in_place
121//! [`first_error()`]: struct.UnescapeUri.html#method.first_error
122//! [`try_to_string()`]: struct.UnescapeUri.html#method.try_to_string
123//! [`try_to_cow()`]: struct.UnescapeUri.html#method.try_to_cow
124//! [`EscapeUri`]: struct.EscapeUri.html
125//! [`UnescapeUri`]: struct.UnescapeUri.html
126//!
127mod escape_uri;
128pub use escape_uri::*;
129
130mod unescape_uri;
131pub use unescape_uri::*;
132
133#[cfg(test)]
134mod test;
135
136/// Trait for `str` adding URI percent encoding/decoding
137///
138/// See the [module-level](index.html) documentation for more details.
139///
140pub trait StrExt {
141 /// Gets an iterator that performs general-purpose URI percent-encoding.
142 ///
143 /// By default, all characters described by [`IETF-RFC3986`] as `pchar`s will be escaped,
144 /// which is appropriate for escaping path segments.
145 /// This behavior can be modified by appending the following modifiers:
146 ///
147 /// * [`full()`]: Escapes all characters except those which are `unreserved`.
148 /// * [`for_query()`]: Escaping appropriate for the query component.
149 /// * [`for_fragment()`]: Escaping appropriate for the fragment component.
150 ///
151 /// The returned iterator will escape ASCII control characters.
152 ///
153 /// [`full()`]: struct.EscapeUri#method.full
154 /// [`for_query()`]: struct.EscapeUri#method.for_query
155 /// [`for_fragment()`]: struct.EscapeUri#method.for_fragment
156 fn escape_uri(&self) -> EscapeUri<'_, EscapeUriSegment>;
157
158 /// Gets an iterator that performs URI percent-decoding.
159 ///
160 /// By default, when the iterator encounters an error the behavior is as follows:
161 ///
162 /// * Unescaped ASCII control codes are dropped.
163 /// * Escaped ASCII control codes are converted to [Unicode Control Pictures] (i.e. `%00` => `␀`)
164 /// * Bad percent-escape sequences (like `"%Foo"`) are replaced with [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]
165 /// * Incomplete UTF8 sequences (like `"%E2%82"`) are replaced with [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]
166 /// * Invalid UTF8 sequences (like `"%E2%82%E2"`) are replaced with [`U+FFFD REPLACEMENT CHARACTER`][U+FFFD]
167 ///
168 /// [U+FFFD]: core::char::REPLACEMENT_CHARACTER
169 /// [Unicode Control Pictures]: https://www.unicode.org/charts/PDF/U2400.pdf
170 fn unescape_uri(&self) -> UnescapeUri<'_>;
171
172 /// **Experimental:** Unescapes the given mutable string in-place, returning a subset of
173 /// the mutable slice even if it contains encoding errors or illegal characters.
174 ///
175 /// The behavior upon encountering errors is identical to that of
176 /// [`unescape_uri()`](#method.unescape_uri).
177 fn unescape_uri_in_place(&mut self) -> &mut str;
178}
179
180impl StrExt for str {
181 fn escape_uri(&self) -> EscapeUri<'_, EscapeUriSegment> {
182 EscapeUri {
183 iter: self.as_bytes().iter(),
184 state: EscapeUriState::Normal,
185 needs_escape: EscapeUriSegment,
186 }
187 }
188
189 fn unescape_uri(&self) -> UnescapeUri<'_> {
190 UnescapeUri {
191 iter: self.chars(),
192 iter_index: 0,
193 next_c: None,
194 had_error: false,
195 skip_slashes: false,
196 }
197 }
198
199 fn unescape_uri_in_place(&mut self) -> &mut str {
200 let mut ptr = self.as_mut_ptr();
201 let iter = self.unescape_uri();
202
203 for c in iter {
204 let mut buf = [0u8; 4];
205 for i in 0..c.encode_utf8(&mut buf).len() {
206 unsafe {
207 // SAFETY: The correctness of this code depends on the unescape
208 // iterator always being either at the same place or ahead
209 // of `ptr`. If this ever turns out to not be the case,
210 // the result will be corrupt.
211 *ptr = buf[i];
212 ptr = ptr.offset(1);
213 }
214 }
215 }
216
217 let len = (ptr as usize) - (self.as_mut_ptr() as usize);
218
219 &mut self[..len]
220 }
221}