fluent_uri/encoding/
mod.rs

1//! Percent-encoding utilities.
2
3pub mod encoder;
4mod estring;
5mod imp;
6pub(crate) mod table;
7mod utf8;
8
9pub use estring::EString;
10pub use table::Table;
11
12pub(crate) use imp::{decode_octet, encode_byte, OCTET_TABLE_LO};
13pub(crate) use utf8::{next_code_point, Utf8Chunks};
14
15use crate::internal::PathEncoder;
16use alloc::{
17    borrow::{Cow, ToOwned},
18    string::{FromUtf8Error, String},
19    vec::Vec,
20};
21use core::{cmp::Ordering, hash, iter::FusedIterator, marker::PhantomData, str};
22use ref_cast::{ref_cast_custom, RefCastCustom};
23
/// A trait used by [`EStr`] and [`EString`] to specify the table used for encoding.
///
/// Implementors act as type-level markers: an [`EStr<E>`](EStr) stores only a
/// `PhantomData<E>` next to its string data, so it never holds a value of `E`.
///
/// # Sub-encoders
///
/// A sub-encoder `SubE` of `E` is an encoder such that `SubE::TABLE` is a [subset] of `E::TABLE`.
///
/// [subset]: Table::is_subset
pub trait Encoder: 'static {
    /// The table used for encoding.
    const TABLE: &'static Table;
}
35
/// Percent-encoded string slices.
///
/// The owned counterpart of `EStr` is [`EString`]. See its documentation
/// if you want to build a percent-encoded string from scratch.
///
/// # Type parameter
///
/// The `EStr<E>` type is parameterized over a type `E` that implements [`Encoder`].
/// The associated constant `E::TABLE` of type [`Table`] specifies the byte patterns
/// allowed in a string. In short, the underlying byte sequence of an `EStr<E>` slice
/// can be formed by joining any number of the following byte sequences:
///
/// - `ch.encode_utf8(&mut [0; 4])` where `E::TABLE.allows(ch)`.
/// - `[b'%', hi, lo]` where `E::TABLE.allows_pct_encoded() && hi.is_ascii_hexdigit() && lo.is_ascii_hexdigit()`.
///
/// # Comparison
///
/// `EStr` slices are compared [lexicographically](Ord#lexicographical-comparison)
/// by their byte values. Normalization is **not** performed prior to comparison.
///
/// # Examples
///
/// Parse key-value pairs from a query string into a hash map:
///
/// ```
/// use fluent_uri::{encoding::EStr, UriRef};
/// use std::collections::HashMap;
///
/// let s = "?name=%E5%BC%A0%E4%B8%89&speech=%C2%A1Ol%C3%A9%21";
/// let query = UriRef::parse(s)?.query().unwrap();
/// let map: HashMap<_, _> = query
///     .split('&')
///     .map(|s| s.split_once('=').unwrap_or((s, EStr::EMPTY)))
///     .map(|(k, v)| (k.decode().into_string_lossy(), v.decode().into_string_lossy()))
///     .collect();
/// assert_eq!(map["name"], "张三");
/// assert_eq!(map["speech"], "¡Olé!");
/// # Ok::<_, fluent_uri::error::ParseError>(())
/// ```
// `#[repr(transparent)]` together with the `RefCastCustom` derive is what backs
// the `#[ref_cast_custom]` constructor `new_validated` (`&str` -> `&EStr<E>`).
#[derive(RefCastCustom)]
#[repr(transparent)]
pub struct EStr<E: Encoder> {
    // Zero-sized marker recording which encoder the slice is associated with.
    encoder: PhantomData<E>,
    // The string data itself; being unsized, it makes `EStr` unsized too.
    inner: str,
}
81
82struct Assert<L: Encoder, R: Encoder> {
83    _marker: PhantomData<(L, R)>,
84}
85
86impl<L: Encoder, R: Encoder> Assert<L, R> {
87    const L_IS_SUB_ENCODER_OF_R: () = assert!(L::TABLE.is_subset(R::TABLE), "not a sub-encoder");
88}
89
90impl<E: Encoder> EStr<E> {
91    const ASSERT_ALLOWS_PCT_ENCODED: () = assert!(
92        E::TABLE.allows_pct_encoded(),
93        "table does not allow percent-encoded octets"
94    );
95
96    /// Converts a string slice to an `EStr` slice assuming validity.
97    #[ref_cast_custom]
98    pub(crate) const fn new_validated(s: &str) -> &Self;
99
100    /// An empty `EStr` slice.
101    pub const EMPTY: &'static Self = Self::new_validated("");
102
103    pub(crate) fn cast<F: Encoder>(&self) -> &EStr<F> {
104        EStr::new_validated(&self.inner)
105    }
106
107    /// Converts a string slice to an `EStr` slice.
108    ///
109    /// # Panics
110    ///
111    /// Panics if the string is not properly encoded with `E`.
112    /// For a non-panicking variant, use [`new`](Self::new).
113    #[must_use]
114    pub const fn new_or_panic(s: &str) -> &Self {
115        match Self::new(s) {
116            Some(s) => s,
117            None => panic!("improperly encoded string"),
118        }
119    }
120
121    /// Converts a string slice to an `EStr` slice, returning `None` if the conversion fails.
122    #[must_use]
123    pub const fn new(s: &str) -> Option<&Self> {
124        if E::TABLE.validate(s.as_bytes()) {
125            Some(EStr::new_validated(s))
126        } else {
127            None
128        }
129    }
130
131    /// Yields the underlying string slice.
132    #[must_use]
133    pub fn as_str(&self) -> &str {
134        &self.inner
135    }
136
137    /// Returns the length of the `EStr` slice in bytes.
138    #[must_use]
139    pub fn len(&self) -> usize {
140        self.inner.len()
141    }
142
143    /// Checks whether the `EStr` slice is empty.
144    #[must_use]
145    pub fn is_empty(&self) -> bool {
146        self.inner.is_empty()
147    }
148
149    /// Upcasts the `EStr` slice to associate it with the given super-encoder.
150    ///
151    /// # Panics
152    ///
153    /// Panics at compile time if `E` is not a [sub-encoder](Encoder#sub-encoders) of `SuperE`.
154    ///
155    /// # Example
156    ///
157    /// ```
158    /// use fluent_uri::encoding::{encoder::{IPath, Path}, EStr};
159    ///
160    /// let path = EStr::<Path>::new_or_panic("foo");
161    /// let path: &EStr<IPath> = path.upcast();
162    /// ```
163    #[cfg(fluent_uri_unstable)]
164    #[must_use]
165    pub fn upcast<SuperE: Encoder>(&self) -> &EStr<SuperE> {
166        let () = Assert::<E, SuperE>::L_IS_SUB_ENCODER_OF_R;
167        EStr::new_validated(self.as_str())
168    }
169
170    /// Decodes the `EStr` slice.
171    ///
172    /// Always **split** before decoding, as otherwise the data may be
173    /// mistaken for component delimiters.
174    ///
175    /// This method allocates only when the slice contains any percent-encoded octet.
176    ///
177    /// Note that this method will **not** decode `U+002B` (+) as `0x20` (space).
178    ///
179    /// # Panics
180    ///
181    /// Panics at compile time if `E::TABLE` does not [allow percent-encoded octets].
182    ///
183    /// [allow percent-encoded octets]: Table::allows_pct_encoded
184    ///
185    /// # Examples
186    ///
187    /// ```
188    /// use fluent_uri::encoding::{encoder::Path, EStr};
189    ///
190    /// let dec = EStr::<Path>::new_or_panic("%C2%A1Hola%21").decode();
191    /// assert_eq!(dec.as_bytes(), &[0xc2, 0xa1, 0x48, 0x6f, 0x6c, 0x61, 0x21]);
192    /// assert_eq!(dec.into_string().unwrap(), "¡Hola!");
193    /// ```
194    #[must_use]
195    pub fn decode(&self) -> Decode<'_> {
196        let () = Self::ASSERT_ALLOWS_PCT_ENCODED;
197
198        match imp::decode(self.inner.as_bytes()) {
199            Some(vec) => Decode::Owned(vec),
200            None => Decode::Borrowed(self.as_str()),
201        }
202    }
203
204    /// Returns an iterator over subslices of the `EStr` slice separated by the given delimiter.
205    ///
206    /// # Panics
207    ///
208    /// Panics if the delimiter is not a [reserved] character.
209    ///
210    /// [reserved]: https://datatracker.ietf.org/doc/html/rfc3986#section-2.2
211    ///
212    /// # Examples
213    ///
214    /// ```
215    /// use fluent_uri::encoding::{encoder::Path, EStr};
216    ///
217    /// assert!(EStr::<Path>::new_or_panic("a,b,c").split(',').eq(["a", "b", "c"]));
218    /// assert!(EStr::<Path>::new_or_panic(",").split(',').eq(["", ""]));
219    /// assert!(EStr::<Path>::EMPTY.split(',').eq([""]));
220    /// ```
221    pub fn split(&self, delim: char) -> Split<'_, E> {
222        assert!(
223            delim.is_ascii() && table::RESERVED.allows(delim),
224            "splitting with non-reserved character"
225        );
226        Split {
227            inner: self.inner.split(delim),
228            encoder: PhantomData,
229        }
230    }
231
232    /// Splits the `EStr` slice on the first occurrence of the given delimiter and
233    /// returns prefix before delimiter and suffix after delimiter.
234    ///
235    /// Returns `None` if the delimiter is not found.
236    ///
237    /// # Panics
238    ///
239    /// Panics if the delimiter is not a [reserved] character.
240    ///
241    /// [reserved]: https://datatracker.ietf.org/doc/html/rfc3986#section-2.2
242    ///
243    /// # Examples
244    ///
245    /// ```
246    /// use fluent_uri::encoding::{encoder::Path, EStr};
247    ///
248    /// assert_eq!(
249    ///     EStr::<Path>::new_or_panic("foo;bar;baz").split_once(';'),
250    ///     Some((EStr::new_or_panic("foo"), EStr::new_or_panic("bar;baz")))
251    /// );
252    ///
253    /// assert_eq!(EStr::<Path>::new_or_panic("foo").split_once(';'), None);
254    /// ```
255    #[must_use]
256    pub fn split_once(&self, delim: char) -> Option<(&Self, &Self)> {
257        assert!(
258            delim.is_ascii() && table::RESERVED.allows(delim),
259            "splitting with non-reserved character"
260        );
261        self.inner
262            .split_once(delim)
263            .map(|(a, b)| (Self::new_validated(a), Self::new_validated(b)))
264    }
265
266    /// Splits the `EStr` slice on the last occurrence of the given delimiter and
267    /// returns prefix before delimiter and suffix after delimiter.
268    ///
269    /// Returns `None` if the delimiter is not found.
270    ///
271    /// # Panics
272    ///
273    /// Panics if the delimiter is not a [reserved] character.
274    ///
275    /// [reserved]: https://datatracker.ietf.org/doc/html/rfc3986#section-2.2
276    ///
277    /// # Examples
278    ///
279    /// ```
280    /// use fluent_uri::encoding::{encoder::Path, EStr};
281    ///
282    /// assert_eq!(
283    ///     EStr::<Path>::new_or_panic("foo;bar;baz").rsplit_once(';'),
284    ///     Some((EStr::new_or_panic("foo;bar"), EStr::new_or_panic("baz")))
285    /// );
286    ///
287    /// assert_eq!(EStr::<Path>::new_or_panic("foo").rsplit_once(';'), None);
288    /// ```
289    #[must_use]
290    pub fn rsplit_once(&self, delim: char) -> Option<(&Self, &Self)> {
291        assert!(
292            delim.is_ascii() && table::RESERVED.allows(delim),
293            "splitting with non-reserved character"
294        );
295        self.inner
296            .rsplit_once(delim)
297            .map(|(a, b)| (Self::new_validated(a), Self::new_validated(b)))
298    }
299}
300
// Identity conversion, so an `&EStr<E>` can be passed where
// `impl AsRef<EStr<E>>` is expected.
impl<E: Encoder> AsRef<Self> for EStr<E> {
    fn as_ref(&self) -> &Self {
        self
    }
}
306
307impl<E: Encoder> AsRef<str> for EStr<E> {
308    fn as_ref(&self) -> &str {
309        &self.inner
310    }
311}
312
313impl<E: Encoder> PartialEq for EStr<E> {
314    fn eq(&self, other: &Self) -> bool {
315        self.inner == other.inner
316    }
317}
318
319impl<E: Encoder> PartialEq<str> for EStr<E> {
320    fn eq(&self, other: &str) -> bool {
321        &self.inner == other
322    }
323}
324
325impl<E: Encoder> PartialEq<EStr<E>> for str {
326    fn eq(&self, other: &EStr<E>) -> bool {
327        self == &other.inner
328    }
329}
330
// Equality is plain string equality (see the `PartialEq` impl).
impl<E: Encoder> Eq for EStr<E> {}
332
333impl<E: Encoder> hash::Hash for EStr<E> {
334    fn hash<H: hash::Hasher>(&self, state: &mut H) {
335        self.inner.hash(state);
336    }
337}
338
// The ordering is total, so this simply defers to the `Ord` impl.
impl<E: Encoder> PartialOrd for EStr<E> {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
344
345impl<E: Encoder> Ord for EStr<E> {
346    fn cmp(&self, other: &Self) -> Ordering {
347        self.inner.cmp(&other.inner)
348    }
349}
350
351impl<E: Encoder> Default for &EStr<E> {
352    /// Creates an empty `EStr` slice.
353    fn default() -> Self {
354        EStr::EMPTY
355    }
356}
357
358impl<E: Encoder> ToOwned for EStr<E> {
359    type Owned = EString<E>;
360
361    fn to_owned(&self) -> EString<E> {
362        EString::new_validated(self.inner.to_owned())
363    }
364
365    fn clone_into(&self, target: &mut EString<E>) {
366        self.inner.clone_into(&mut target.buf);
367    }
368}
369
370/// Extension methods for the [path] component.
371///
372/// [path]: https://datatracker.ietf.org/doc/html/rfc3986#section-3.3
373impl<E: PathEncoder> EStr<E> {
374    /// Checks whether the path is absolute, i.e., starting with `'/'`.
375    #[inline]
376    #[must_use]
377    pub fn is_absolute(&self) -> bool {
378        self.inner.starts_with('/')
379    }
380
381    /// Checks whether the path is rootless, i.e., not starting with `'/'`.
382    #[inline]
383    #[must_use]
384    pub fn is_rootless(&self) -> bool {
385        !self.inner.starts_with('/')
386    }
387
388    /// Returns an iterator over the path segments, separated by `'/'`.
389    ///
390    /// Returns `None` if the path is [rootless]. Use [`split`]
391    /// instead if you need to split a rootless path on occurrences of `'/'`.
392    ///
393    /// Note that the path can be [empty] when authority is present,
394    /// in which case this method will return `None`.
395    ///
396    /// [rootless]: Self::is_rootless
397    /// [`split`]: Self::split
398    /// [empty]: Self::is_empty
399    ///
400    /// # Examples
401    ///
402    /// ```
403    /// use fluent_uri::Uri;
404    ///
405    /// // Segments are separated by '/'.
406    /// // The empty string before a leading '/' is not a segment.
407    /// // However, segments can be empty in the other cases.
408    /// let path = Uri::parse("file:///path/to//dir/")?.path();
409    /// assert_eq!(path, "/path/to//dir/");
410    /// assert!(path.segments_if_absolute().unwrap().eq(["path", "to", "", "dir", ""]));
411    ///
412    /// let path = Uri::parse("foo:bar/baz")?.path();
413    /// assert_eq!(path, "bar/baz");
414    /// assert!(path.segments_if_absolute().is_none());
415    ///
416    /// let path = Uri::parse("http://example.com")?.path();
417    /// assert!(path.is_empty());
418    /// assert!(path.segments_if_absolute().is_none());
419    /// # Ok::<_, fluent_uri::error::ParseError>(())
420    /// ```
421    #[inline]
422    #[must_use]
423    pub fn segments_if_absolute(&self) -> Option<Split<'_, E>> {
424        self.inner
425            .strip_prefix('/')
426            .map(|s| EStr::new_validated(s).split('/'))
427    }
428}
429
/// A wrapper of percent-decoded bytes.
///
/// This enum is created by [`EStr::decode`].
#[derive(Clone, Debug)]
pub enum Decode<'a> {
    /// Nothing was decoded; the original text is borrowed as-is.
    Borrowed(&'a str),
    /// At least one percent-encoded octet was decoded into this buffer.
    Owned(Vec<u8>),
}

impl<'a> Decode<'a> {
    /// Borrows the decoded bytes.
    #[inline]
    #[must_use]
    pub fn as_bytes(&self) -> &[u8] {
        match *self {
            Decode::Borrowed(text) => text.as_bytes(),
            Decode::Owned(ref bytes) => bytes.as_slice(),
        }
    }

    /// Consumes this `Decode`, yielding the decoded bytes without copying them.
    #[inline]
    #[must_use]
    pub fn into_bytes(self) -> Cow<'a, [u8]> {
        match self {
            Decode::Borrowed(text) => Cow::Borrowed(text.as_bytes()),
            Decode::Owned(bytes) => Cow::Owned(bytes),
        }
    }

    /// Converts the decoded bytes to a string.
    ///
    /// # Errors
    ///
    /// Returns `Err` if the bytes are not valid UTF-8.
    #[inline]
    pub fn into_string(self) -> Result<Cow<'a, str>, FromUtf8Error> {
        let string = match self {
            // A borrowed `str` is valid UTF-8 already, so this arm cannot fail.
            Decode::Borrowed(text) => Cow::Borrowed(text),
            Decode::Owned(bytes) => Cow::Owned(String::from_utf8(bytes)?),
        };
        Ok(string)
    }

    /// Converts the decoded bytes to a string, including invalid characters.
    ///
    /// This calls [`String::from_utf8_lossy`] if the bytes are not valid UTF-8.
    #[must_use]
    pub fn into_string_lossy(self) -> Cow<'a, str> {
        self.into_string()
            .unwrap_or_else(|err| Cow::Owned(String::from_utf8_lossy(err.as_bytes()).into_owned()))
    }
}
486
/// An iterator over subslices of an [`EStr`] slice separated by a delimiter.
///
/// This struct is created by [`EStr::split`].
#[derive(Clone, Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct Split<'a, E: Encoder> {
    // The underlying `str` splitter that does the actual work.
    inner: str::Split<'a, char>,
    // Zero-sized marker fixing the encoder of the yielded `EStr<E>` slices.
    encoder: PhantomData<E>,
}
496
497impl<'a, E: Encoder> Iterator for Split<'a, E> {
498    type Item = &'a EStr<E>;
499
500    fn next(&mut self) -> Option<&'a EStr<E>> {
501        self.inner.next().map(EStr::new_validated)
502    }
503}
504
505impl<'a, E: Encoder> DoubleEndedIterator for Split<'a, E> {
506    fn next_back(&mut self) -> Option<&'a EStr<E>> {
507        self.inner.next_back().map(EStr::new_validated)
508    }
509}
510
// `next` only forwards to the inner `str::Split`, so this marker relies on
// that iterator being fused.
impl<E: Encoder> FusedIterator for Split<'_, E> {}