iri_string/normalize/
path.rs

1//! Path normalization.
2
3use core::fmt;
4use core::ops::Range;
5
6use crate::parser::str::{find_split_hole, rfind};
7use crate::spec::{Spec, UriSpec};
8
9use super::pct_case::PctCaseNormalized;
10use super::{Error, NormalizationMode, NormalizationOp};
11
12/// Path that is (possibly) not yet processed or being processed.
13#[derive(Debug, Clone, Copy)]
14pub(crate) enum Path<'a> {
15    /// The result. No more processing is needed.
16    Done(&'a str),
17    /// Not yet completely processed path.
18    NeedsProcessing(PathToNormalize<'a>),
19}
20
21/// Path that needs merge and/or dot segment removal.
22///
23/// # Invariants
24///
25/// If the first field (prefix field) is not `None`, it must end with a slash.
26#[derive(Debug, Clone, Copy)]
27pub(crate) struct PathToNormalize<'a>(Option<&'a str>, &'a str);
28
29impl<'a> PathToNormalize<'a> {
30    /// Creates a `PathToNormalize` from the given single path.
31    #[inline]
32    #[must_use]
33    pub(crate) fn from_single_path(path: &'a str) -> Self {
34        Self(None, path)
35    }
36
37    /// Creates a `PathToNormalize` from the given base and reference paths to be resolved.
38    #[must_use]
39    pub(crate) fn from_paths_to_be_resolved(base: &'a str, reference: &'a str) -> Self {
40        if reference.starts_with('/') {
41            return Self(None, reference);
42        }
43
44        match rfind(base.as_bytes(), b'/') {
45            Some(last_slash_pos) => Self(Some(&base[..=last_slash_pos]), reference),
46            None => Self(None, reference),
47        }
48    }
49
50    /// Returns true if the path is empty string.
51    #[inline]
52    #[must_use]
53    fn is_empty(&self) -> bool {
54        // If `self.0` is `Some(_)`, it ends with a slash, i.e. it is not empty.
55        self.0.is_none() && self.1.is_empty()
56    }
57
58    /// Returns the length of the not yet normalized path.
59    #[inline]
60    #[must_use]
61    pub(super) fn len(&self) -> usize {
62        self.len_prefix() + self.1.len()
63    }
64
65    /// Returns the length of the prefix part.
66    ///
67    /// Returns 0 if the prefix part is empty.
68    #[inline]
69    #[must_use]
70    fn len_prefix(&self) -> usize {
71        self.0.map_or(0, |s| s.len())
72    }
73
74    /// Returns a byte at the given position.
75    #[must_use]
76    fn byte_at(&self, mut i: usize) -> Option<u8> {
77        if let Some(prefix) = self.0 {
78            if i < prefix.len() {
79                return Some(prefix.as_bytes()[i]);
80            }
81            i -= prefix.len();
82        }
83        self.1.as_bytes().get(i).copied()
84    }
85
86    /// Returns the position of the next slash of the byte at the given position.
87    #[must_use]
88    fn find_next_slash(&self, scan_start: usize) -> Option<usize> {
89        if let Some(prefix) = self.0 {
90            let prefix_len = prefix.len();
91            if scan_start < prefix_len {
92                prefix[scan_start..].find('/').map(|rel| rel + scan_start)
93            } else {
94                let local_i = scan_start - prefix_len;
95                self.1[local_i..].find('/').map(|rel| rel + scan_start)
96            }
97        } else {
98            self.1[scan_start..].find('/').map(|rel| rel + scan_start)
99        }
100    }
101
102    /// Removes the `len` characters from the beginning of `self`.
103    fn remove_start(&mut self, len: usize) {
104        if let Some(prefix) = self.0 {
105            if let Some(suffix_trim_len) = len.checked_sub(prefix.len()) {
106                self.0 = None;
107                self.1 = &self.1[suffix_trim_len..];
108            } else {
109                self.0 = Some(&prefix[len..]);
110            }
111        } else {
112            self.1 = &self.1[len..];
113        }
114    }
115
116    /// Removes the prefix that are ignorable on normalization.
117    // Skips the prefix dot segments without leading slashes (such as `./`,
118    // `../`, and `../.././`).
119    // This is necessary because such segments should be removed with the
120    // FOLLOWING slashes, not leading slashes.
121    fn remove_ignorable_prefix(&mut self) {
122        while let Some(seg) = PathSegmentsIter::new(self).next() {
123            if seg.has_leading_slash {
124                // The first segment starting with a slash is not target.
125                break;
126            }
127            match seg.kind(self) {
128                SegmentKind::Dot | SegmentKind::DotDot => {
129                    // Attempt to skip the following slash by `+ 1`.
130                    let skip = self.len().min(seg.range.end + 1);
131                    self.remove_start(skip);
132                }
133                SegmentKind::Normal => break,
134            }
135        }
136    }
137}
138
139impl PathToNormalize<'_> {
140    /// Writes the normalized path.
141    pub(crate) fn fmt_write_normalize<S: Spec, W: fmt::Write>(
142        &self,
143        f: &mut W,
144        op: NormalizationOp,
145        authority_is_present: bool,
146    ) -> fmt::Result {
147        debug_assert!(
148            self.0.map_or(true, |s| s.ends_with('/')),
149            "[validity] the prefix field of `PathToNormalize` should end with a slash"
150        );
151
152        if self.is_empty() {
153            return Ok(());
154        }
155
156        if (op.mode == NormalizationMode::PreserveAuthoritylessRelativePath)
157            && !authority_is_present
158            && self.byte_at(0) != Some(b'/')
159        {
160            // Treat the path as "opaque", i.e. do not apply dot segments removal.
161            // See <https://github.com/lo48576/iri-string/issues/29>.
162            debug_assert!(
163                op.mode.case_pct_normalization(),
164                "[consistency] case/pct normalization should still be applied"
165            );
166            if let Some(prefix) = self.0 {
167                write!(f, "{}", PctCaseNormalized::<S>::new(prefix))?;
168            }
169            write!(f, "{}", PctCaseNormalized::<S>::new(self.1))?;
170            return Ok(());
171        }
172
173        let mut rest = *self;
174
175        // Skip the prefix dot segments without leading slashes (such as `./`,
176        // `../`, and `../.././`).
177        // This is necessary because such segments should be removed with the
178        // FOLLOWING slashes, not leading slashes.
179        rest.remove_ignorable_prefix();
180        if rest.is_empty() {
181            // Path consists of only `/.`s and `/..`s.
182            // In this case, if the authority component is present, the result
183            // should be `/`, not empty.
184            if authority_is_present {
185                f.write_char('/')?;
186            }
187            return Ok(());
188        }
189
190        // None: No segments are written yet.
191        // Some(false): Something other than `/` is already written as the path.
192        // Some(true): Only a `/` is written as the path.
193        let mut only_a_slash_is_written = None;
194        let mut too_deep_area_may_have_dot_segments = true;
195        while !rest.is_empty() && too_deep_area_may_have_dot_segments {
196            /// The size of the queue to track the path segments.
197            ///
198            /// This should be nonzero.
199            const QUEUE_SIZE: usize = 8;
200
201            {
202                // Skip `/.` and `/..` segments at the head.
203                let mut skipped_len = 0;
204                for seg in PathSegmentsIter::new(&rest) {
205                    match seg.kind(&rest) {
206                        SegmentKind::Dot | SegmentKind::DotDot => {
207                            debug_assert!(
208                                seg.has_leading_slash,
209                                "[consistency] `.` or `..` segments without a
210                                 leading slash have already been skipped"
211                            );
212                            skipped_len = seg.range.end;
213                        }
214                        _ => break,
215                    }
216                }
217                rest.remove_start(skipped_len);
218                if rest.is_empty() {
219                    // Finished with a dot segment.
220                    // The last `/.` or `/..` should be replaced to `/`.
221                    if !authority_is_present && (only_a_slash_is_written == Some(true)) {
222                        // Insert a dot segment to break the prefix `//`.
223                        // Without this, the path starts with `//` and it may
224                        // be confused with the prefix of an authority.
225                        f.write_str(".//")?;
226                    } else {
227                        f.write_char('/')?;
228                    }
229                    break;
230                }
231            }
232
233            let mut queue: [Option<&'_ str>; QUEUE_SIZE] = Default::default();
234            let mut level: usize = 0;
235            let mut first_segment_has_leading_slash = false;
236
237            // Find higher path segments.
238            let mut end = 0;
239            for seg in PathSegmentsIter::new(&rest) {
240                let kind = seg.kind(&rest);
241                match kind {
242                    SegmentKind::Dot => {
243                        too_deep_area_may_have_dot_segments = true;
244                    }
245                    SegmentKind::DotDot => {
246                        level = level.saturating_sub(1);
247                        too_deep_area_may_have_dot_segments = true;
248                        if level < queue.len() {
249                            queue[level] = None;
250                        }
251                    }
252                    SegmentKind::Normal => {
253                        if level < queue.len() {
254                            queue[level] = Some(seg.segment(&rest));
255                            too_deep_area_may_have_dot_segments = false;
256                            end = seg.range.end;
257                            if level == 0 {
258                                first_segment_has_leading_slash = seg.has_leading_slash;
259                            }
260                        }
261                        level += 1;
262                    }
263                }
264            }
265
266            // Write the path segments as possible, and update the internal state.
267            for segname in queue.iter().flatten() {
268                Self::emit_segment::<S, _>(
269                    f,
270                    &mut only_a_slash_is_written,
271                    first_segment_has_leading_slash,
272                    segname,
273                    authority_is_present,
274                    op,
275                )?;
276            }
277
278            rest.remove_start(end);
279        }
280
281        if !rest.is_empty() {
282            // No need of searching dot segments anymore.
283            assert!(
284                !too_deep_area_may_have_dot_segments,
285                "[consistency] loop condition of the previous loop"
286            );
287            // Apply only normalization (if needed).
288            for seg in PathSegmentsIter::new(&rest) {
289                assert_eq!(
290                    seg.kind(&rest),
291                    SegmentKind::Normal,
292                    "[consistency] already confirmed that there are no more dot segments"
293                );
294                let segname = seg.segment(&rest);
295                Self::emit_segment::<S, _>(
296                    f,
297                    &mut only_a_slash_is_written,
298                    seg.has_leading_slash,
299                    segname,
300                    authority_is_present,
301                    op,
302                )?;
303            }
304        }
305
306        Ok(())
307    }
308
309    /// Emits a non-dot segment and update the current state.
310    //
311    // `first_segment_has_leading_slash` can be any value if the segment is not the first one.
312    fn emit_segment<S: Spec, W: fmt::Write>(
313        f: &mut W,
314        only_a_slash_is_written: &mut Option<bool>,
315        first_segment_has_leading_slash: bool,
316        segname: &str,
317        authority_is_present: bool,
318        op: NormalizationOp,
319    ) -> fmt::Result {
320        // Omit the leading slash of the segment only if the segment is
321        // the first one and marked as not having a leading slash.
322        match *only_a_slash_is_written {
323            None => {
324                // First segment.
325                // This pass can be possible if `./` is repeated `QUEUE_SIZE`
326                // times at the beginning.
327                if first_segment_has_leading_slash {
328                    f.write_char('/')?;
329                }
330                *only_a_slash_is_written =
331                    Some(first_segment_has_leading_slash && segname.is_empty());
332            }
333            Some(only_a_slash) => {
334                if only_a_slash && !authority_is_present {
335                    // Apply serialization like WHATWG URL Standard.
336                    // This prevents `<scheme=foo>:<path=//bar>` from written as
337                    // `foo://bar`, which is interpreted as
338                    // `<scheme=foo>://<authority=bar>`. Prepending `./`, the
339                    // serialization result would be `foo:/.//bar`, which is safe.
340                    f.write_str("./")?;
341                    *only_a_slash_is_written = Some(false);
342                }
343                f.write_char('/')?;
344            }
345        }
346
347        // Write the segment name.
348        if op.mode.case_pct_normalization() {
349            write!(f, "{}", PctCaseNormalized::<S>::new(segname))
350        } else {
351            f.write_str(segname)
352        }
353    }
354
355    /// Checks if the path is normalizable by RFC 3986 algorithm when the authority is absent.
356    ///
357    /// Returns `Ok(())` when normalizable, returns `Err(_)` if not.
358    pub(crate) fn ensure_rfc3986_normalizable_with_authority_absent(&self) -> Result<(), Error> {
359        /// A sink to get the prefix of the input.
360        #[derive(Default)]
361        struct PrefixRetriever {
362            /// The buffer to remember the prefix of the input.
363            buf: [u8; 3],
364            /// The next write position in the buffer.
365            cursor: usize,
366        }
367        impl PrefixRetriever {
368            /// Returns the read prefix data.
369            #[inline]
370            #[must_use]
371            fn as_bytes(&self) -> &[u8] {
372                &self.buf[..self.cursor]
373            }
374        }
375        impl fmt::Write for PrefixRetriever {
376            fn write_str(&mut self, s: &str) -> fmt::Result {
377                if !s.is_empty() && (self.cursor >= self.buf.len()) {
378                    // Enough bytes are read.
379                    return Err(fmt::Error);
380                }
381                self.buf[self.cursor..]
382                    .iter_mut()
383                    .zip(s.bytes())
384                    .for_each(|(dest, src)| *dest = src);
385                self.cursor = self.cursor.saturating_add(s.len()).min(self.buf.len());
386                Ok(())
387            }
388        }
389
390        let mut prefix = PrefixRetriever::default();
391        // The failure of this write indicates more than 3 characters are read.
392        // This is safe to ignore since the check needs only 3 characters.
393        let _ = self.fmt_write_normalize::<UriSpec, _>(
394            &mut prefix,
395            NormalizationOp {
396                mode: NormalizationMode::None,
397            },
398            // Assume the authority is absent.
399            false,
400        );
401
402        if prefix.as_bytes() == b"/./" {
403            Err(Error::new())
404        } else {
405            Ok(())
406        }
407    }
408}
409
410/// Characteristic of a path.
411#[derive(Debug, Clone, Copy)]
412pub(crate) enum PathCharacteristic {
413    /// Absolute path, not special.
414    CommonAbsolute,
415    /// Absolute path, not special.
416    CommonRelative,
417    /// The first path segment of the relative path has one or more colon characters.
418    RelativeFirstSegmentHasColon,
419    /// The path starts with the double slash.
420    StartsWithDoubleSlash,
421}
422
423impl PathCharacteristic {
424    /// Returns true if the path is absolute.
425    #[inline]
426    #[must_use]
427    pub(crate) fn is_absolute(self) -> bool {
428        matches!(self, Self::CommonAbsolute | Self::StartsWithDoubleSlash)
429    }
430
431    /// Returns the characteristic of the path.
432    pub(crate) fn from_path_to_display<S: Spec>(
433        path: &PathToNormalize<'_>,
434        op: NormalizationOp,
435        authority_is_present: bool,
436    ) -> Self {
437        /// Dummy writer to get necessary values.
438        #[derive(Default, Clone, Copy)]
439        struct Writer {
440            /// Result.
441            result: Option<PathCharacteristic>,
442            /// Whether the normalized path is absolute.
443            is_absolute: Option<bool>,
444        }
445        impl fmt::Write for Writer {
446            fn write_str(&mut self, mut s: &str) -> fmt::Result {
447                if self.result.is_some() {
448                    // Nothing more to do.
449                    return Err(fmt::Error);
450                }
451                while !s.is_empty() {
452                    if self.is_absolute.is_none() {
453                        // The first input.
454                        match s.strip_prefix('/') {
455                            Some(rest) => {
456                                self.is_absolute = Some(true);
457                                s = rest;
458                            }
459                            None => {
460                                self.is_absolute = Some(false);
461                            }
462                        }
463                        continue;
464                    }
465                    if self.is_absolute == Some(true) {
466                        let result = if s.starts_with('/') {
467                            PathCharacteristic::StartsWithDoubleSlash
468                        } else {
469                            PathCharacteristic::CommonAbsolute
470                        };
471                        self.result = Some(result);
472                        return Err(fmt::Error);
473                    }
474                    // Processing the first segment of the relative path.
475                    match find_split_hole(s, b'/') {
476                        Some((first_seg, _rest)) => {
477                            let result = if first_seg.contains(':') {
478                                PathCharacteristic::RelativeFirstSegmentHasColon
479                            } else {
480                                PathCharacteristic::CommonRelative
481                            };
482                            self.result = Some(result);
483                            return Err(fmt::Error);
484                        }
485                        None => {
486                            // `s` might not be the complete first segment.
487                            if s.contains(':') {
488                                self.result =
489                                    Some(PathCharacteristic::RelativeFirstSegmentHasColon);
490                                return Err(fmt::Error);
491                            }
492                            break;
493                        }
494                    }
495                }
496                Ok(())
497            }
498        }
499
500        let mut writer = Writer::default();
501        match path.fmt_write_normalize::<S, _>(&mut writer, op, authority_is_present) {
502            // Empty path.
503            Ok(_) => PathCharacteristic::CommonRelative,
504            Err(_) => writer
505                .result
506                .expect("[consistency] the formatting quits early by `Err` when the check is done"),
507        }
508    }
509}
510
511/// Path segment kind.
512#[derive(Debug, Clone, Copy, PartialEq, Eq)]
513enum SegmentKind {
514    /// `.` or the equivalents.
515    Dot,
516    /// `..` or the equivalents.
517    DotDot,
518    /// Other normal (not special) segments.
519    Normal,
520}
521
522impl SegmentKind {
523    /// Creates a new `SegmentKind` from the given segment name.
524    #[must_use]
525    fn from_segment(s: &str) -> Self {
526        match s {
527            "." | "%2E" | "%2e" => SegmentKind::Dot,
528            ".." | ".%2E" | ".%2e" | "%2E." | "%2E%2E" | "%2E%2e" | "%2e." | "%2e%2E"
529            | "%2e%2e" => SegmentKind::DotDot,
530            _ => SegmentKind::Normal,
531        }
532    }
533}
534
535/// A segment with optional leading slash.
536#[derive(Debug, Clone)]
537struct PathSegment {
538    /// Presence of a leading slash.
539    has_leading_slash: bool,
540    /// Range of the segment name (without any slashes).
541    range: Range<usize>,
542}
543
544impl PathSegment {
545    /// Returns the segment without any slashes.
546    #[inline]
547    #[must_use]
548    fn segment<'a>(&self, path: &PathToNormalize<'a>) -> &'a str {
549        if let Some(prefix) = path.0 {
550            let prefix_len = prefix.len();
551            if self.range.end <= prefix_len {
552                &prefix[self.range.clone()]
553            } else {
554                let range = (self.range.start - prefix_len)..(self.range.end - prefix_len);
555                &path.1[range]
556            }
557        } else {
558            &path.1[self.range.clone()]
559        }
560    }
561
562    /// Returns the segment kind.
563    #[inline]
564    #[must_use]
565    fn kind(&self, path: &PathToNormalize<'_>) -> SegmentKind {
566        SegmentKind::from_segment(self.segment(path))
567    }
568}
569
570/// Iterator of path segments.
571struct PathSegmentsIter<'a> {
572    /// Path.
573    path: &'a PathToNormalize<'a>,
574    /// Current cursor position.
575    cursor: usize,
576}
577
578impl<'a> PathSegmentsIter<'a> {
579    /// Creates a new iterator of path segments.
580    #[inline]
581    #[must_use]
582    fn new(path: &'a PathToNormalize<'a>) -> Self {
583        Self { path, cursor: 0 }
584    }
585}
586
587impl Iterator for PathSegmentsIter<'_> {
588    type Item = PathSegment;
589
590    fn next(&mut self) -> Option<Self::Item> {
591        let path_len = self.path.len();
592        if self.cursor >= path_len {
593            return None;
594        }
595        let has_leading_slash = self.path.byte_at(self.cursor) == Some(b'/');
596
597        let prefix_len = self.path.len_prefix();
598        if (prefix_len != 0) && (self.cursor == prefix_len - 1) {
599            debug_assert!(has_leading_slash);
600            let end = self.path.1.find('/').unwrap_or(self.path.1.len()) + prefix_len;
601            self.cursor = end;
602            return Some(PathSegment {
603                has_leading_slash,
604                range: prefix_len..end,
605            });
606        }
607
608        if has_leading_slash {
609            // Skip the leading slash.
610            self.cursor += 1;
611        };
612        let start = self.cursor;
613        self.cursor = self.path.find_next_slash(self.cursor).unwrap_or(path_len);
614
615        Some(PathSegment {
616            has_leading_slash,
617            range: start..self.cursor,
618        })
619    }
620}