Skip to main content

iri_string/
convert.rs

1//! Conversion between URI/IRI types.
2
3use core::fmt;
4
5#[cfg(feature = "alloc")]
6use alloc::collections::TryReserveError;
7#[cfg(all(feature = "alloc", not(feature = "std")))]
8use alloc::string::String;
9
10#[cfg(feature = "alloc")]
11use crate::format::{ToDedicatedString, ToStringFallible};
12use crate::spec::Spec;
13use crate::types::{
14    RiAbsoluteStr, RiFragmentStr, RiQueryStr, RiReferenceStr, RiRelativeStr, RiStr,
15};
16#[cfg(feature = "alloc")]
17use crate::types::{
18    RiAbsoluteString, RiFragmentString, RiQueryString, RiReferenceString, RiRelativeString,
19    RiString,
20};
21
/// Uppercase hexadecimal digits, indexed by the value of a nibble (`0..=15`).
const HEXDIGITS: [u8; 16] = *b"0123456789ABCDEF";
26
/// A resource identifier mapped to a URI of some kind.
///
/// The wrapper borrows the source string; formatting it with
/// [`core::fmt::Display`] writes the source with every non-ASCII byte
/// percent-encoded.
///
/// Supported `Src` types are:
///
/// * IRIs:
///     + [`IriAbsoluteStr`] (alias of `RiAbsoluteStr<IriSpec>`)
///     + [`IriReferenceStr`] (alias of `RiReferenceStr<IriSpec>`)
///     + [`IriRelativeStr`] (alias of `RiRelativeStr<IriSpec>`)
///     + [`IriStr`] (alias of `RiStr<IriSpec>`)
/// * URIs:
///     + [`UriAbsoluteStr`] (alias of `RiAbsoluteStr<UriSpec>`)
///     + [`UriReferenceStr`] (alias of `RiReferenceStr<UriSpec>`)
///     + [`UriRelativeStr`] (alias of `RiRelativeStr<UriSpec>`)
///     + [`UriStr`] (alias of `RiStr<UriSpec>`)
/// * Components (for any spec `S`):
///     + [`RiQueryStr<S>`][`RiQueryStr`]
///     + [`RiFragmentStr<S>`][`RiFragmentStr`]
///
/// # Examples
///
/// ```
/// use iri_string::convert::MappedToUri;
/// use iri_string::types::{IriStr, UriStr};
///
/// let src = IriStr::new("http://example.com/?alpha=\u{03B1}")?;
/// // The type is `MappedToUri<IriStr>`, but you usually don't need to specify.
/// let mapped = MappedToUri::from(src).to_string();
/// assert_eq!(mapped, "http://example.com/?alpha=%CE%B1");
/// # Ok::<_, iri_string::validate::Error>(())
/// ```
///
/// [`IriAbsoluteStr`]: crate::types::IriAbsoluteStr
/// [`IriReferenceStr`]: crate::types::IriReferenceStr
/// [`IriRelativeStr`]: crate::types::IriRelativeStr
/// [`IriStr`]: crate::types::IriStr
/// [`UriAbsoluteStr`]: crate::types::UriAbsoluteStr
/// [`UriReferenceStr`]: crate::types::UriReferenceStr
/// [`UriRelativeStr`]: crate::types::UriRelativeStr
/// [`UriStr`]: crate::types::UriStr
/// [`RiQueryStr`]: crate::types::RiQueryStr
/// [`RiFragmentStr`]: crate::types::RiFragmentStr
#[derive(Debug)]
pub struct MappedToUri<'a, Src: ?Sized>(&'a Src);

// `Clone` and `Copy` are implemented by hand rather than derived: the derived
// impls would require `Src: Clone` / `Src: Copy`, which an unsized `Src`
// (permitted by the `?Sized` bound) can never satisfy because `Clone: Sized`.
// The wrapper only holds a shared reference, so copying it is always valid.
impl<Src: ?Sized> Clone for MappedToUri<'_, Src> {
    #[inline]
    fn clone(&self) -> Self {
        *self
    }
}

impl<Src: ?Sized> Copy for MappedToUri<'_, Src> {}
65
/// Implements the `MappedToUri` conversions for a pair of string types.
///
/// `$slice` is the borrowed string type and `$string` is its owned
/// counterpart. For every `S: Spec` the expansion provides:
///
/// * `From<&$slice<S>>` and (with `alloc`) `From<&$string<S>>`, which wrap a source,
/// * `fmt::Display`, which writes the source through `write_percent_encoded`,
/// * (with `alloc`) `ToDedicatedString`, whose target is `$string<UriSpec>`.
macro_rules! impl_for_iri {
    ($slice:ident, $string:ident) => {
        impl<'a, S: Spec> From<&'a $slice<S>> for MappedToUri<'a, $slice<S>> {
            #[inline]
            fn from(src: &'a $slice<S>) -> Self {
                Self(src)
            }
        }

        #[cfg(feature = "alloc")]
        impl<'a, S: Spec> From<&'a $string<S>> for MappedToUri<'a, $slice<S>> {
            #[inline]
            fn from(src: &'a $string<S>) -> Self {
                Self(src.as_slice())
            }
        }

        impl<S: Spec> fmt::Display for MappedToUri<'_, $slice<S>> {
            fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
                let source: &str = self.0.as_str();
                write_percent_encoded(f, source)
            }
        }

        #[cfg(feature = "alloc")]
        impl<S: Spec> ToDedicatedString for MappedToUri<'_, $slice<S>> {
            type Target = $string<$crate::spec::UriSpec>;

            fn try_to_dedicated_string(&self) -> Result<Self::Target, TryReserveError> {
                let encoded = self.try_to_string()?;
                // SAFETY: Conversion from an IRI to a URI always succeeds, so
                // the percent-encoded string is always a valid URI.
                let uri = unsafe {
                    <Self::Target>::new_unchecked_justified(
                        encoded,
                        "an IRI must always be encodable into a valid URI",
                    )
                };
                Ok(uri)
            }
        }
    };
}
108
// Instantiate the `MappedToUri` conversions for every supported pair of
// borrowed and owned string types.
impl_for_iri!(RiReferenceStr, RiReferenceString);
impl_for_iri!(RiStr, RiString);
impl_for_iri!(RiAbsoluteStr, RiAbsoluteString);
impl_for_iri!(RiRelativeStr, RiRelativeString);
impl_for_iri!(RiQueryStr, RiQueryString);
impl_for_iri!(RiFragmentStr, RiFragmentString);
115
116/// Percent-encodes and writes the IRI string using the given buffer.
117fn write_percent_encoded(f: &mut fmt::Formatter<'_>, mut s: &str) -> fmt::Result {
118    while !s.is_empty() {
119        // Skip ASCII characters.
120        let non_ascii_pos = s.bytes().position(|b| !b.is_ascii()).unwrap_or(s.len());
121        let (ascii, rest) = s.split_at(non_ascii_pos);
122        if !ascii.is_empty() {
123            f.write_str(ascii)?;
124            s = rest;
125        }
126
127        if s.is_empty() {
128            return Ok(());
129        }
130
131        // Search for the next ASCII character.
132        let nonascii_end = s.bytes().position(|b| b.is_ascii()).unwrap_or(s.len());
133        let (nonasciis, rest) = s.split_at(nonascii_end);
134        debug_assert!(
135            !nonasciis.is_empty(),
136            "string without non-ASCII characters should have caused early return"
137        );
138        s = rest;
139
140        // Escape non-ASCII characters as percent-encoded bytes.
141        //
142        // RFC 3987 (section 3.1 step 2) says "for each character in
143        // 'ucschar' or 'iprivate'", but this simply means "for each
144        // non-ASCII characters" since any non-ASCII characters that can
145        // appear in an IRI match `ucschar` or `iprivate`.
146        /// Number of source bytes to encode at once.
147        const NUM_BYTES_AT_ONCE: usize = 21;
148        percent_encode_bytes(f, nonasciis, &mut [0_u8; NUM_BYTES_AT_ONCE * 3])?;
149    }
150
151    Ok(())
152}
153
154/// Percent-encode the string and pass the encoded chunks to the given function.
155///
156/// `buf` is used as a temporary working buffer. It is initialized by this
157/// function, so users can pass any mutable byte slice with enough size.
158///
159/// # Precondition
160///
161/// The length of `buf` must be 3 bytes or more.
162fn percent_encode_bytes(f: &mut fmt::Formatter<'_>, s: &str, buf: &mut [u8]) -> fmt::Result {
163    /// Fill the buffer by percent-encoded bytes.
164    ///
165    /// Note that this function applies percent-encoding to every characters,
166    /// even if it is ASCII alphabet.
167    ///
168    /// # Precondition
169    ///
170    /// * The length of `buf` must be 3 bytes or more.
171    /// * All of the `buf[i * 3]` elements should already be set to `b'%'`.
172    // This function have many preconditions and I don't want checks for them
173    // to be mandatory, so make this nested inner function.
174    fn fill_by_percent_encoded<'a>(buf: &'a mut [u8], bytes: &mut core::str::Bytes<'_>) -> &'a str {
175        let src_len = bytes.len();
176        // `<[u8; N]>::array_chunks_mut` is unstable as of Rust 1.58.1.
177        for (dest, byte) in buf.chunks_exact_mut(3).zip(bytes.by_ref()) {
178            debug_assert_eq!(
179                dest.len(),
180                3,
181                "`chunks_exact()` must return a slice with the exact length"
182            );
183            debug_assert_eq!(
184                dest[0], b'%',
185                "[precondition] the buffer must be properly initialized"
186            );
187
188            let upper = byte >> 4;
189            let lower = byte & 0b1111;
190            dest[1] = HEXDIGITS[usize::from(upper)];
191            dest[2] = HEXDIGITS[usize::from(lower)];
192        }
193        let num_dest_written = (src_len - bytes.len()) * 3;
194        let buf_filled = &buf[..num_dest_written];
195        // SAFETY: `b'%'` and `HEXDIGITS[_]` are all ASCII characters, so
196        // `buf_filled` is filled with ASCII characters and is valid UTF-8 bytes.
197        unsafe {
198            debug_assert!(core::str::from_utf8(buf_filled).is_ok());
199            core::str::from_utf8_unchecked(buf_filled)
200        }
201    }
202
203    assert!(
204        buf.len() >= 3,
205        "[precondition] length of `buf` must be 3 bytes or more"
206    );
207
208    // Drop the elements that will never be used.
209    // The length to be used is always a multiple of three.
210    let buf_len = buf.len() / 3 * 3;
211    let buf = &mut buf[..buf_len];
212
213    // Fill some bytes with `%`.
214    // This will be vectorized by optimization (especially for long buffers),
215    // so no need to selectively set `buf[i * 3]`.
216    buf.fill(b'%');
217
218    let mut bytes = s.bytes();
219    // `<core::str::Bytes as ExactSizeIterator>::is_empty` is unstable as of Rust 1.58.1.
220    while bytes.len() != 0 {
221        let encoded = fill_by_percent_encoded(buf, &mut bytes);
222        f.write_str(encoded)?;
223    }
224
225    Ok(())
226}
227
/// Percent-encodes the non-ASCII bytes of the given IRI in place.
///
/// ASCII bytes are kept as they are, and each non-ASCII byte is replaced by
/// its three-byte `%XX` form. The string is left untouched when it has no
/// non-ASCII bytes.
///
/// # Errors
///
/// Returns an error if reserving the extra capacity fails; `iri` is not
/// modified in that case.
#[cfg(feature = "alloc")]
pub(crate) fn try_percent_encode_iri_inline(
    iri: &mut String,
) -> Result<(), alloc::collections::TryReserveError> {
    let num_nonascii = count_nonascii(iri);
    if num_nonascii == 0 {
        // Nothing to escape.
        return Ok(());
    }

    // Every non-ASCII byte grows from one byte to three.
    let additional = num_nonascii * 2;
    iri.try_reserve(additional)?;
    let src_len = iri.len();

    // Temporarily take the ownership of the internal buffer.
    let mut bytes = core::mem::take(iri).into_bytes();
    // `b'\0'` cannot appear in a valid IRI, so using it as the padding value
    // makes unwritten positions easy to spot while debugging.
    bytes.resize(src_len + additional, b'\0');

    // Move from the tail to the head so that no unread source byte is
    // overwritten. Once all non-ASCII bytes are processed, both cursors
    // coincide and the remaining prefix is already in its final position.
    let mut read = src_len;
    let mut write = bytes.len();
    let mut pending = num_nonascii;
    while pending != 0 {
        debug_assert!(read > 0, "the source position should not overrun");
        debug_assert!(write > 0, "the destination position should not overrun");
        read -= 1;
        let byte = bytes[read];
        if byte.is_ascii() {
            // Use the ASCII character directly.
            write -= 1;
            bytes[write] = byte;
        } else {
            // Percent-encode the byte.
            write -= 3;
            let triple = [
                b'%',
                HEXDIGITS[usize::from(byte >> 4)],
                HEXDIGITS[usize::from(byte & 0b1111)],
            ];
            bytes[write..write + 3].copy_from_slice(&triple);
            pending -= 1;
        }
    }

    // Hand the encoded buffer back to the caller's string.
    *iri = String::from_utf8(bytes).expect("the encoding result is an ASCII string");
    Ok(())
}
279
/// Returns the number of non-ASCII bytes in the string.
///
/// This counts bytes rather than characters: a single non-ASCII character
/// contributes one for each byte of its UTF-8 encoding.
#[cfg(feature = "alloc")]
#[inline]
#[must_use]
fn count_nonascii(s: &str) -> usize {
    s.as_bytes().iter().filter(|&&byte| byte >= 0x80).count()
}
286}