fluent_uri/encoding/mod.rs
1//! Percent-encoding utilities.
2
3pub mod encoder;
4mod estring;
5mod imp;
6pub(crate) mod table;
7mod utf8;
8
9pub use estring::EString;
10pub use table::Table;
11
12pub(crate) use imp::{decode_octet, encode_byte, OCTET_TABLE_LO};
13pub(crate) use utf8::{next_code_point, Utf8Chunks};
14
15use crate::internal::PathEncoder;
16use alloc::{
17 borrow::{Cow, ToOwned},
18 string::{FromUtf8Error, String},
19 vec::Vec,
20};
21use core::{cmp::Ordering, hash, iter::FusedIterator, marker::PhantomData, str};
22use ref_cast::{ref_cast_custom, RefCastCustom};
23
/// A trait used by [`EStr`] and [`EString`] to specify the table used for encoding.
///
/// An implementor only has to supply the associated constant [`TABLE`](Self::TABLE).
///
/// # Sub-encoders
///
/// A sub-encoder `SubE` of `E` is an encoder such that `SubE::TABLE` is a [subset] of `E::TABLE`.
///
/// [subset]: Table::is_subset
pub trait Encoder: 'static {
    /// The table used for encoding.
    const TABLE: &'static Table;
}
35
/// Percent-encoded string slices.
///
/// The owned counterpart of `EStr` is [`EString`]. See its documentation
/// if you want to build a percent-encoded string from scratch.
///
/// # Type parameter
///
/// The `EStr<E>` type is parameterized over a type `E` that implements [`Encoder`].
/// The associated constant `E::TABLE` of type [`Table`] specifies the byte patterns
/// allowed in a string. In short, the underlying byte sequence of an `EStr<E>` slice
/// can be formed by joining any number of the following byte sequences:
///
/// - `ch.encode_utf8(&mut [0; 4])` where `E::TABLE.allows(ch)`.
/// - `[b'%', hi, lo]` where `E::TABLE.allows_pct_encoded() && hi.is_ascii_hexdigit() && lo.is_ascii_hexdigit()`.
///
/// # Comparison
///
/// `EStr` slices are compared [lexicographically](Ord#lexicographical-comparison)
/// by their byte values. Normalization is **not** performed prior to comparison.
///
/// # Examples
///
/// Parse key-value pairs from a query string into a hash map:
///
/// ```
/// use fluent_uri::{encoding::EStr, UriRef};
/// use std::collections::HashMap;
///
/// let s = "?name=%E5%BC%A0%E4%B8%89&speech=%C2%A1Ol%C3%A9%21";
/// let query = UriRef::parse(s)?.query().unwrap();
/// let map: HashMap<_, _> = query
///     .split('&')
///     .map(|s| s.split_once('=').unwrap_or((s, EStr::EMPTY)))
///     .map(|(k, v)| (k.decode().into_string_lossy(), v.decode().into_string_lossy()))
///     .collect();
/// assert_eq!(map["name"], "张三");
/// assert_eq!(map["speech"], "¡Olé!");
/// # Ok::<_, fluent_uri::error::ParseError>(())
/// ```
#[derive(RefCastCustom)]
#[repr(transparent)]
pub struct EStr<E: Encoder> {
    // Zero-sized marker recording which encoder `E` this slice is associated with.
    encoder: PhantomData<E>,
    // The underlying text. Being the only non-zero-sized field, it is the one
    // `repr(transparent)` exposes, which is what lets `&str` be cast to `&EStr<E>`.
    inner: str,
}
81
82struct Assert<L: Encoder, R: Encoder> {
83 _marker: PhantomData<(L, R)>,
84}
85
86impl<L: Encoder, R: Encoder> Assert<L, R> {
87 const L_IS_SUB_ENCODER_OF_R: () = assert!(L::TABLE.is_subset(R::TABLE), "not a sub-encoder");
88}
89
90impl<E: Encoder> EStr<E> {
91 const ASSERT_ALLOWS_PCT_ENCODED: () = assert!(
92 E::TABLE.allows_pct_encoded(),
93 "table does not allow percent-encoded octets"
94 );
95
96 /// Converts a string slice to an `EStr` slice assuming validity.
97 #[ref_cast_custom]
98 pub(crate) const fn new_validated(s: &str) -> &Self;
99
100 /// An empty `EStr` slice.
101 pub const EMPTY: &'static Self = Self::new_validated("");
102
103 pub(crate) fn cast<F: Encoder>(&self) -> &EStr<F> {
104 EStr::new_validated(&self.inner)
105 }
106
107 /// Converts a string slice to an `EStr` slice.
108 ///
109 /// # Panics
110 ///
111 /// Panics if the string is not properly encoded with `E`.
112 /// For a non-panicking variant, use [`new`](Self::new).
113 #[must_use]
114 pub const fn new_or_panic(s: &str) -> &Self {
115 match Self::new(s) {
116 Some(s) => s,
117 None => panic!("improperly encoded string"),
118 }
119 }
120
121 /// Converts a string slice to an `EStr` slice, returning `None` if the conversion fails.
122 #[must_use]
123 pub const fn new(s: &str) -> Option<&Self> {
124 if E::TABLE.validate(s.as_bytes()) {
125 Some(EStr::new_validated(s))
126 } else {
127 None
128 }
129 }
130
131 /// Yields the underlying string slice.
132 #[must_use]
133 pub fn as_str(&self) -> &str {
134 &self.inner
135 }
136
137 /// Returns the length of the `EStr` slice in bytes.
138 #[must_use]
139 pub fn len(&self) -> usize {
140 self.inner.len()
141 }
142
143 /// Checks whether the `EStr` slice is empty.
144 #[must_use]
145 pub fn is_empty(&self) -> bool {
146 self.inner.is_empty()
147 }
148
149 /// Upcasts the `EStr` slice to associate it with the given super-encoder.
150 ///
151 /// # Panics
152 ///
153 /// Panics at compile time if `E` is not a [sub-encoder](Encoder#sub-encoders) of `SuperE`.
154 ///
155 /// # Example
156 ///
157 /// ```
158 /// use fluent_uri::encoding::{encoder::{IPath, Path}, EStr};
159 ///
160 /// let path = EStr::<Path>::new_or_panic("foo");
161 /// let path: &EStr<IPath> = path.upcast();
162 /// ```
163 #[cfg(fluent_uri_unstable)]
164 #[must_use]
165 pub fn upcast<SuperE: Encoder>(&self) -> &EStr<SuperE> {
166 let () = Assert::<E, SuperE>::L_IS_SUB_ENCODER_OF_R;
167 EStr::new_validated(self.as_str())
168 }
169
170 /// Decodes the `EStr` slice.
171 ///
172 /// Always **split** before decoding, as otherwise the data may be
173 /// mistaken for component delimiters.
174 ///
175 /// This method allocates only when the slice contains any percent-encoded octet.
176 ///
177 /// Note that this method will **not** decode `U+002B` (+) as `0x20` (space).
178 ///
179 /// # Panics
180 ///
181 /// Panics at compile time if `E::TABLE` does not [allow percent-encoded octets].
182 ///
183 /// [allow percent-encoded octets]: Table::allows_pct_encoded
184 ///
185 /// # Examples
186 ///
187 /// ```
188 /// use fluent_uri::encoding::{encoder::Path, EStr};
189 ///
190 /// let dec = EStr::<Path>::new_or_panic("%C2%A1Hola%21").decode();
191 /// assert_eq!(dec.as_bytes(), &[0xc2, 0xa1, 0x48, 0x6f, 0x6c, 0x61, 0x21]);
192 /// assert_eq!(dec.into_string().unwrap(), "¡Hola!");
193 /// ```
194 #[must_use]
195 pub fn decode(&self) -> Decode<'_> {
196 let () = Self::ASSERT_ALLOWS_PCT_ENCODED;
197
198 match imp::decode(self.inner.as_bytes()) {
199 Some(vec) => Decode::Owned(vec),
200 None => Decode::Borrowed(self.as_str()),
201 }
202 }
203
204 /// Returns an iterator over subslices of the `EStr` slice separated by the given delimiter.
205 ///
206 /// # Panics
207 ///
208 /// Panics if the delimiter is not a [reserved] character.
209 ///
210 /// [reserved]: https://datatracker.ietf.org/doc/html/rfc3986#section-2.2
211 ///
212 /// # Examples
213 ///
214 /// ```
215 /// use fluent_uri::encoding::{encoder::Path, EStr};
216 ///
217 /// assert!(EStr::<Path>::new_or_panic("a,b,c").split(',').eq(["a", "b", "c"]));
218 /// assert!(EStr::<Path>::new_or_panic(",").split(',').eq(["", ""]));
219 /// assert!(EStr::<Path>::EMPTY.split(',').eq([""]));
220 /// ```
221 pub fn split(&self, delim: char) -> Split<'_, E> {
222 assert!(
223 delim.is_ascii() && table::RESERVED.allows(delim),
224 "splitting with non-reserved character"
225 );
226 Split {
227 inner: self.inner.split(delim),
228 encoder: PhantomData,
229 }
230 }
231
232 /// Splits the `EStr` slice on the first occurrence of the given delimiter and
233 /// returns prefix before delimiter and suffix after delimiter.
234 ///
235 /// Returns `None` if the delimiter is not found.
236 ///
237 /// # Panics
238 ///
239 /// Panics if the delimiter is not a [reserved] character.
240 ///
241 /// [reserved]: https://datatracker.ietf.org/doc/html/rfc3986#section-2.2
242 ///
243 /// # Examples
244 ///
245 /// ```
246 /// use fluent_uri::encoding::{encoder::Path, EStr};
247 ///
248 /// assert_eq!(
249 /// EStr::<Path>::new_or_panic("foo;bar;baz").split_once(';'),
250 /// Some((EStr::new_or_panic("foo"), EStr::new_or_panic("bar;baz")))
251 /// );
252 ///
253 /// assert_eq!(EStr::<Path>::new_or_panic("foo").split_once(';'), None);
254 /// ```
255 #[must_use]
256 pub fn split_once(&self, delim: char) -> Option<(&Self, &Self)> {
257 assert!(
258 delim.is_ascii() && table::RESERVED.allows(delim),
259 "splitting with non-reserved character"
260 );
261 self.inner
262 .split_once(delim)
263 .map(|(a, b)| (Self::new_validated(a), Self::new_validated(b)))
264 }
265
266 /// Splits the `EStr` slice on the last occurrence of the given delimiter and
267 /// returns prefix before delimiter and suffix after delimiter.
268 ///
269 /// Returns `None` if the delimiter is not found.
270 ///
271 /// # Panics
272 ///
273 /// Panics if the delimiter is not a [reserved] character.
274 ///
275 /// [reserved]: https://datatracker.ietf.org/doc/html/rfc3986#section-2.2
276 ///
277 /// # Examples
278 ///
279 /// ```
280 /// use fluent_uri::encoding::{encoder::Path, EStr};
281 ///
282 /// assert_eq!(
283 /// EStr::<Path>::new_or_panic("foo;bar;baz").rsplit_once(';'),
284 /// Some((EStr::new_or_panic("foo;bar"), EStr::new_or_panic("baz")))
285 /// );
286 ///
287 /// assert_eq!(EStr::<Path>::new_or_panic("foo").rsplit_once(';'), None);
288 /// ```
289 #[must_use]
290 pub fn rsplit_once(&self, delim: char) -> Option<(&Self, &Self)> {
291 assert!(
292 delim.is_ascii() && table::RESERVED.allows(delim),
293 "splitting with non-reserved character"
294 );
295 self.inner
296 .rsplit_once(delim)
297 .map(|(a, b)| (Self::new_validated(a), Self::new_validated(b)))
298 }
299}
300
// Identity conversion: returns the receiver unchanged as `&EStr<E>`.
impl<E: Encoder> AsRef<Self> for EStr<E> {
    fn as_ref(&self) -> &Self {
        self
    }
}
306
307impl<E: Encoder> AsRef<str> for EStr<E> {
308 fn as_ref(&self) -> &str {
309 &self.inner
310 }
311}
312
313impl<E: Encoder> PartialEq for EStr<E> {
314 fn eq(&self, other: &Self) -> bool {
315 self.inner == other.inner
316 }
317}
318
319impl<E: Encoder> PartialEq<str> for EStr<E> {
320 fn eq(&self, other: &str) -> bool {
321 &self.inner == other
322 }
323}
324
325impl<E: Encoder> PartialEq<EStr<E>> for str {
326 fn eq(&self, other: &EStr<E>) -> bool {
327 self == &other.inner
328 }
329}
330
// Marker impl: equality on `EStr` delegates to `str` equality, which is a full equivalence relation.
impl<E: Encoder> Eq for EStr<E> {}
332
333impl<E: Encoder> hash::Hash for EStr<E> {
334 fn hash<H: hash::Hasher>(&self, state: &mut H) {
335 self.inner.hash(state);
336 }
337}
338
339impl<E: Encoder> PartialOrd for EStr<E> {
340 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
341 Some(self.cmp(other))
342 }
343}
344
345impl<E: Encoder> Ord for EStr<E> {
346 fn cmp(&self, other: &Self) -> Ordering {
347 self.inner.cmp(&other.inner)
348 }
349}
350
351impl<E: Encoder> Default for &EStr<E> {
352 /// Creates an empty `EStr` slice.
353 fn default() -> Self {
354 EStr::EMPTY
355 }
356}
357
358impl<E: Encoder> ToOwned for EStr<E> {
359 type Owned = EString<E>;
360
361 fn to_owned(&self) -> EString<E> {
362 EString::new_validated(self.inner.to_owned())
363 }
364
365 fn clone_into(&self, target: &mut EString<E>) {
366 self.inner.clone_into(&mut target.buf);
367 }
368}
369
370/// Extension methods for the [path] component.
371///
372/// [path]: https://datatracker.ietf.org/doc/html/rfc3986#section-3.3
373impl<E: PathEncoder> EStr<E> {
374 /// Checks whether the path is absolute, i.e., starting with `'/'`.
375 #[inline]
376 #[must_use]
377 pub fn is_absolute(&self) -> bool {
378 self.inner.starts_with('/')
379 }
380
381 /// Checks whether the path is rootless, i.e., not starting with `'/'`.
382 #[inline]
383 #[must_use]
384 pub fn is_rootless(&self) -> bool {
385 !self.inner.starts_with('/')
386 }
387
388 /// Returns an iterator over the path segments, separated by `'/'`.
389 ///
390 /// Returns `None` if the path is [rootless]. Use [`split`]
391 /// instead if you need to split a rootless path on occurrences of `'/'`.
392 ///
393 /// Note that the path can be [empty] when authority is present,
394 /// in which case this method will return `None`.
395 ///
396 /// [rootless]: Self::is_rootless
397 /// [`split`]: Self::split
398 /// [empty]: Self::is_empty
399 ///
400 /// # Examples
401 ///
402 /// ```
403 /// use fluent_uri::Uri;
404 ///
405 /// // Segments are separated by '/'.
406 /// // The empty string before a leading '/' is not a segment.
407 /// // However, segments can be empty in the other cases.
408 /// let path = Uri::parse("file:///path/to//dir/")?.path();
409 /// assert_eq!(path, "/path/to//dir/");
410 /// assert!(path.segments_if_absolute().unwrap().eq(["path", "to", "", "dir", ""]));
411 ///
412 /// let path = Uri::parse("foo:bar/baz")?.path();
413 /// assert_eq!(path, "bar/baz");
414 /// assert!(path.segments_if_absolute().is_none());
415 ///
416 /// let path = Uri::parse("http://example.com")?.path();
417 /// assert!(path.is_empty());
418 /// assert!(path.segments_if_absolute().is_none());
419 /// # Ok::<_, fluent_uri::error::ParseError>(())
420 /// ```
421 #[inline]
422 #[must_use]
423 pub fn segments_if_absolute(&self) -> Option<Split<'_, E>> {
424 self.inner
425 .strip_prefix('/')
426 .map(|s| EStr::new_validated(s).split('/'))
427 }
428}
429
/// A wrapper of percent-decoded bytes.
///
/// This enum is created by [`EStr::decode`].
#[derive(Clone, Debug)]
pub enum Decode<'a> {
    /// No percent-encoded octets are decoded.
    Borrowed(&'a str),
    /// One or more percent-encoded octets are decoded.
    Owned(Vec<u8>),
}

impl<'a> Decode<'a> {
    /// Returns a reference to the decoded bytes.
    #[inline]
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        match self {
            Self::Owned(bytes) => bytes.as_slice(),
            Self::Borrowed(text) => text.as_bytes(),
        }
    }

    /// Consumes this `Decode` and yields the underlying decoded bytes.
    ///
    /// The borrowed variant stays borrowed; the owned buffer is moved, not copied.
    #[inline]
    #[must_use]
    pub fn into_bytes(self) -> Cow<'a, [u8]> {
        match self {
            Self::Owned(bytes) => Cow::from(bytes),
            Self::Borrowed(text) => Cow::from(text.as_bytes()),
        }
    }

    /// Converts the decoded bytes to a string.
    ///
    /// # Errors
    ///
    /// Returns `Err` if the bytes are not valid UTF-8.
    #[inline]
    pub fn into_string(self) -> Result<Cow<'a, str>, FromUtf8Error> {
        let text = match self {
            // Already a `str`, so no UTF-8 check is needed.
            Self::Borrowed(text) => Cow::Borrowed(text),
            Self::Owned(bytes) => Cow::Owned(String::from_utf8(bytes)?),
        };
        Ok(text)
    }

    /// Converts the decoded bytes to a string, including invalid characters.
    ///
    /// This calls [`String::from_utf8_lossy`] if the bytes are not valid UTF-8.
    #[must_use]
    pub fn into_string_lossy(self) -> Cow<'a, str> {
        self.into_string().unwrap_or_else(|err| {
            Cow::Owned(String::from_utf8_lossy(err.as_bytes()).into_owned())
        })
    }
}
486
/// An iterator over subslices of an [`EStr`] slice separated by a delimiter.
///
/// This struct is created by [`EStr::split`].
#[derive(Clone, Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Split<'a, E: Encoder> {
    // The wrapped `str` splitter that does the actual work.
    inner: str::Split<'a, char>,
    // Zero-sized marker remembering which encoder the yielded slices belong to.
    encoder: PhantomData<E>,
}
496
497impl<'a, E: Encoder> Iterator for Split<'a, E> {
498 type Item = &'a EStr<E>;
499
500 fn next(&mut self) -> Option<&'a EStr<E>> {
501 self.inner.next().map(EStr::new_validated)
502 }
503}
504
505impl<'a, E: Encoder> DoubleEndedIterator for Split<'a, E> {
506 fn next_back(&mut self) -> Option<&'a EStr<E>> {
507 self.inner.next_back().map(EStr::new_validated)
508 }
509}
510
// Marker impl: `next` forwards directly to the inner `str::Split`, which is itself fused.
impl<E: Encoder> FusedIterator for Split<'_, E> {}