fluent_uri/
normalize.rs

1//! Module for normalization.
2
3use crate::{
4    component::Scheme,
5    imp::{HostMeta, Meta, RiMaybeRef, RmrRef},
6    parse,
7    pct_enc::{
8        self,
9        encoder::{Data, IData},
10        Decode, DecodedChunk, DecodedUtf8Chunk, Encode, EncodedChunk, Encoder, Table,
11    },
12    resolve,
13};
14use alloc::string::String;
15use borrow_or_share::Bos;
16use core::{
17    fmt::{self, Write},
18    num::NonZeroUsize,
19};
20
/// An error that occurred when normalizing a URI/IRI (reference).
///
/// This is the error type returned by [`Normalizer::normalize`].
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum NormalizeError {
    /// An underflow occurred in path normalization.
    ///
    /// Used only when [`Normalizer::allow_path_underflow`] is set to `false`.
    PathUnderflow,
}
29
30impl fmt::Display for NormalizeError {
31    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32        let msg = match self {
33            Self::PathUnderflow => "underflow occurred in path resolution",
34        };
35        f.write_str(msg)
36    }
37}
38
// `NormalizeError` implements `crate::Error` only when the `impl-error`
// feature is enabled; the trait's provided methods are used as-is.
#[cfg(feature = "impl-error")]
impl crate::Error for NormalizeError {}
41
/// A configurable URI/IRI (reference) normalizer.
///
/// A normalizer is created with [`Normalizer::new`] and customized with the
/// by-value builder methods [`allow_path_underflow`](Self::allow_path_underflow)
/// and [`default_port_with`](Self::default_port_with). The type is marked
/// `#[must_use]`, so discarding the result of a builder call triggers a warning.
#[derive(Clone, Copy)]
// NOTE(review): the reason for opting out of `Debug` is not visible in this
// file — confirm before adding a derive.
#[allow(missing_debug_implementations)]
#[must_use]
pub struct Normalizer {
    /// Whether an underflow in path normalization is tolerated
    /// instead of being reported as [`NormalizeError::PathUnderflow`].
    allow_path_underflow: bool,
    /// Function used to look up the default port of a scheme.
    default_port_f: fn(&Scheme) -> Option<u16>,
}
50
51impl Normalizer {
52    /// Creates a new `Normalizer` with default configuration.
53    pub fn new() -> Self {
54        Self {
55            allow_path_underflow: true,
56            default_port_f: Scheme::default_port,
57        }
58    }
59
60    /// Sets whether to allow underflow in path normalization.
61    ///
62    /// This defaults to `true`. A value of `false` is a deviation from the
63    /// normalization methods described in
64    /// [Section 6 of RFC 3986](https://datatracker.ietf.org/doc/html/rfc3986/#section-6).
65    ///
66    /// # Examples
67    ///
68    /// ```
69    /// use fluent_uri::{normalize::{Normalizer, NormalizeError}, Uri};
70    ///
71    /// let normalizer = Normalizer::new().allow_path_underflow(false);
72    /// let uri = Uri::parse("http://example.com/..")?;
73    ///
74    /// assert_eq!(normalizer.normalize(&uri).unwrap_err(), NormalizeError::PathUnderflow);
75    /// # Ok::<_, fluent_uri::ParseError>(())
76    /// ```
77    pub fn allow_path_underflow(mut self, value: bool) -> Self {
78        self.allow_path_underflow = value;
79        self
80    }
81
82    /// Sets the function with which to get the default port of a scheme.
83    ///
84    /// This defaults to [`Scheme::default_port`].
85    ///
86    /// # Examples
87    ///
88    /// ```
89    /// use fluent_uri::{component::Scheme, normalize::Normalizer, Uri};
90    ///
91    /// const SCHEME_FOO: &Scheme = Scheme::new_or_panic("foo");
92    ///
93    /// let normalizer = Normalizer::new().default_port_with(|scheme| {
94    ///     if scheme == SCHEME_FOO {
95    ///         Some(4673)
96    ///     } else {
97    ///         scheme.default_port()
98    ///     }
99    /// });
100    /// let uri = Uri::parse("foo://localhost:4673")?;
101    ///
102    /// assert_eq!(normalizer.normalize(&uri).unwrap(), "foo://localhost");
103    /// # Ok::<_, fluent_uri::ParseError>(())
104    /// ```
105    pub fn default_port_with(mut self, f: fn(&Scheme) -> Option<u16>) -> Self {
106        self.default_port_f = f;
107        self
108    }
109
110    /// Normalizes the given URI/IRI (reference).
111    ///
112    /// See [`Uri::normalize`][crate::Uri::normalize] for the exact behavior of this method.
113    ///
114    /// # Errors
115    ///
116    /// Returns `Err` if an underflow occurred in path normalization
117    /// when [`allow_path_underflow`] is set to `false`.
118    ///
119    /// [`allow_path_underflow`]: Self::allow_path_underflow
120    pub fn normalize<R: RiMaybeRef>(&self, r: &R) -> Result<R::WithVal<String>, NormalizeError>
121    where
122        R::Val: Bos<str>,
123    {
124        normalize(
125            r.make_ref(),
126            R::CONSTRAINTS.ascii_only,
127            self.allow_path_underflow,
128            self.default_port_f,
129        )
130        .map(RiMaybeRef::from_pair)
131    }
132}
133
134impl Default for Normalizer {
135    fn default() -> Self {
136        Self::new()
137    }
138}
139
/// Normalizes the URI/IRI (reference) `r` into a freshly allocated string.
///
/// - `ascii_only` selects the table handed to `normalize_estr`:
///   `Data::TABLE` when `true`, `IData::TABLE` otherwise (for the query,
///   `IData::TABLE.or_iprivate()` is used in the latter case).
/// - `allow_path_underflow` decides whether an underflow reported by
///   `resolve::remove_dot_segments` is tolerated.
/// - `default_port_f` maps a scheme to its default port; a port that parses
///   to the default one is left out of the output, as is an empty port.
///
/// On success, returns the normalized string together with the `Meta`
/// describing where the scheme, authority, path and query lie within it.
///
/// # Errors
///
/// Returns `NormalizeError::PathUnderflow` if removing dot segments
/// underflowed and `allow_path_underflow` is `false`.
pub(crate) fn normalize(
    r: RmrRef<'_, '_>,
    ascii_only: bool,
    allow_path_underflow: bool,
    default_port_f: fn(&Scheme) -> Option<u16>,
) -> Result<(String, Meta), NormalizeError> {
    // For "a://[::ffff:5:9]/" the capacity is not enough,
    // but it's fine since this rarely happens.
    let mut buf = String::with_capacity(r.as_str().len());

    // The path is normalized first, into its own buffer, and appended
    // to `buf` only after the scheme and authority have been written.
    let path = r.path().as_str();
    let mut path_buf = String::with_capacity(path.len());

    let data_table = if ascii_only {
        Data::TABLE
    } else {
        IData::TABLE
    };

    if r.has_scheme() && path.starts_with('/') {
        // `buf` is still empty here and serves as scratch space: it holds the
        // normalized path, from which dot segments are removed into `path_buf`.
        normalize_estr(&mut buf, path, false, data_table);

        let underflow_occurred = resolve::remove_dot_segments(&mut path_buf, 0, &[&buf]);
        if underflow_occurred && !allow_path_underflow {
            return Err(NormalizeError::PathUnderflow);
        }

        // Done with the scratch content; the output is built from scratch below.
        buf.clear();
    } else {
        // Don't remove dot segments from relative reference or rootless path.
        normalize_estr(&mut path_buf, path, false, data_table);
    }

    let mut meta = Meta::default();

    if let Some(scheme) = r.scheme_opt() {
        buf.push_str(scheme.as_str());
        // `buf` contains nothing but the scheme at this point,
        // so lowercasing the whole buffer lowercases just the scheme.
        buf.make_ascii_lowercase();
        meta.scheme_end = NonZeroUsize::new(buf.len());
        buf.push(':');
    }

    if let Some(auth) = r.authority() {
        buf.push_str("//");

        if let Some(userinfo) = auth.userinfo() {
            normalize_estr(&mut buf, userinfo.as_str(), false, data_table);
            buf.push('@');
        }

        // Start from the input's authority metadata and update
        // the host bounds (and possibly the host kind) for the output.
        let mut auth_meta = auth.meta();
        auth_meta.host_bounds.0 = buf.len();
        match auth_meta.host_meta {
            // An IPv4 address is always canonical.
            HostMeta::Ipv4(..) => buf.push_str(auth.host()),
            #[cfg(feature = "net")]
            HostMeta::Ipv6(addr) => write!(buf, "[{addr}]").unwrap(),
            #[cfg(not(feature = "net"))]
            HostMeta::Ipv6() => {
                // Without the `net` feature the address is re-parsed from the
                // host text (skipping the leading '[') and written by `write_v6`.
                buf.push('[');
                write_v6(&mut buf, parse::parse_v6(&auth.host().as_bytes()[1..]));
                buf.push(']');
            }
            HostMeta::IpvFuture => {
                let start = buf.len();
                buf.push_str(auth.host());

                // Only the host part that was just appended is lowercased.
                buf[start..].make_ascii_lowercase();
            }
            HostMeta::RegName => {
                let start = buf.len();
                let host = auth.host();
                normalize_estr(&mut buf, host, true, data_table);

                if buf.len() < start + host.len() {
                    // Only reparse when the length is less than before.
                    auth_meta.host_meta = parse::parse_v4_or_reg_name(&buf.as_bytes()[start..]);
                }
            }
        }
        auth_meta.host_bounds.1 = buf.len();
        meta.auth_meta = Some(auth_meta);

        if let Some(port) = auth.port() {
            // An empty port is dropped together with its ':' delimiter.
            if !port.is_empty() {
                let mut eq_default = false;
                if let Some(scheme) = r.scheme_opt() {
                    if let Some(default) = default_port_f(scheme) {
                        // A port that fails to parse never equals the default.
                        eq_default = port.as_str().parse().ok() == Some(default);
                    }
                }
                if !eq_default {
                    buf.push(':');
                    buf.push_str(port.as_str());
                }
            }
        }
    }

    meta.path_bounds.0 = buf.len();
    // Make sure that the output is a valid URI/IRI reference.
    if r.has_scheme() && !r.has_authority() && path_buf.starts_with("//") {
        // The "/." prefix is recorded as part of the path (it comes after
        // `path_bounds.0`) and keeps the leading "//" from directly
        // following the scheme's ':'.
        buf.push_str("/.");
    }
    buf.push_str(&path_buf);
    meta.path_bounds.1 = buf.len();

    if let Some(query) = r.query() {
        buf.push('?');

        // For the non-ASCII case, the query uses the `IData` table
        // extended with `or_iprivate()`.
        const IQUERY_DATA: &Table = &IData::TABLE.or_iprivate();
        let query_data_table = if ascii_only { Data::TABLE } else { IQUERY_DATA };

        normalize_estr(&mut buf, query.as_str(), false, query_data_table);
        meta.query_end = NonZeroUsize::new(buf.len());
    }

    if let Some(fragment) = r.fragment() {
        buf.push('#');
        normalize_estr(&mut buf, fragment.as_str(), false, data_table);
    }

    Ok((buf, meta))
}
264
265fn normalize_estr(buf: &mut String, s: &str, to_ascii_lowercase: bool, table: &Table) {
266    if table.allows_non_ascii() {
267        Decode::new(s).decode_utf8(|chunk| match chunk {
268            DecodedUtf8Chunk::Unencoded(s) => {
269                let i = buf.len();
270                buf.push_str(s);
271                if to_ascii_lowercase {
272                    buf[i..].make_ascii_lowercase();
273                }
274            }
275            DecodedUtf8Chunk::Decoded { valid, invalid } => {
276                for chunk in Encode::new(table, valid) {
277                    match chunk {
278                        EncodedChunk::Unencoded(s) => {
279                            let i = buf.len();
280                            buf.push_str(s);
281                            if to_ascii_lowercase {
282                                buf[i..].make_ascii_lowercase();
283                            }
284                        }
285                        EncodedChunk::PctEncoded(s) => buf.push_str(s),
286                    }
287                }
288                for &x in invalid {
289                    buf.push_str(pct_enc::encode_byte(x));
290                }
291            }
292        });
293    } else {
294        for chunk in Decode::new(s) {
295            match chunk {
296                DecodedChunk::Unencoded(s) => {
297                    let i = buf.len();
298                    buf.push_str(s);
299                    if to_ascii_lowercase {
300                        buf[i..].make_ascii_lowercase();
301                    }
302                }
303                DecodedChunk::PctDecoded(mut x) => {
304                    if table.allows_ascii(x) {
305                        if to_ascii_lowercase {
306                            x.make_ascii_lowercase();
307                        }
308                        buf.push(x as char);
309                    } else {
310                        buf.push_str(pct_enc::encode_byte(x));
311                    }
312                }
313            }
314        }
315    }
316}
317
// Follows the algorithm of `impl Display for Ipv6Addr`.
/// Appends the textual form of the IPv6 address given by `segments` to `buf`.
///
/// An address of the form `0:0:0:0:0:ffff:a.b.c.d` is written as
/// `::ffff:a.b.c.d`. Otherwise the segments are written in lowercase hex,
/// with the longest run of two or more zero segments (the first one in case
/// of a tie) replaced by `::`.
#[cfg(not(feature = "net"))]
fn write_v6(buf: &mut String, segments: [u16; 8]) {
    /// Appends `groups` in lowercase hex, separated by colons.
    fn push_groups(out: &mut String, groups: &[u16]) {
        for (idx, group) in groups.iter().enumerate() {
            if idx != 0 {
                out.push(':');
            }
            write!(out, "{group:x}").unwrap();
        }
    }

    // IPv4-mapped address: the last 32 bits are shown in dotted-decimal form.
    if let [0, 0, 0, 0, 0, 0xffff, hi, lo] = segments {
        let [a, b] = hi.to_be_bytes();
        let [c, d] = lo.to_be_bytes();
        write!(buf, "::ffff:{a}.{b}.{c}.{d}").unwrap();
        return;
    }

    // Find the longest run of zero segments; an earlier run wins over
    // a later one of the same length because the comparison is strict.
    let mut best_start: usize = 0;
    let mut best_len: usize = 0;
    let mut run_start: usize = 0;
    let mut run_len: usize = 0;
    for (idx, &segment) in segments.iter().enumerate() {
        if segment != 0 {
            run_len = 0;
            continue;
        }
        if run_len == 0 {
            run_start = idx;
        }
        run_len += 1;
        if run_len > best_len {
            best_start = run_start;
            best_len = run_len;
        }
    }

    if best_len > 1 {
        // Compress the run: write what precedes it, "::", then what follows it.
        push_groups(buf, &segments[..best_start]);
        buf.push_str("::");
        push_groups(buf, &segments[best_start + best_len..]);
    } else {
        // A lone zero segment is not compressed.
        push_groups(buf, &segments);
    }
}
375}