// rusty_detox/filter/mod.rs

//! The Filter pipeline (FR-001..FR-007 + FR-039).
//!
//! Each [`Filter`] variant is a single-step byte-sequence transformation. A
//! [`Sequence`](crate::Sequence) is an ordered `Vec<Filter>` that is consumed
//! left-to-right when [`Detox::sanitize`](crate::Detox::sanitize) runs.
6
7pub mod iso8859_1;
8pub mod max_length;
9pub mod safe;
10pub mod safe_platform;
11pub mod uncgi;
12pub mod utf8;
13pub mod wipeup;
14
/// Default unsafe-character set for [`Filter::Safe`], for callers who want
/// the upstream-compatible default.
///
/// The set contains the ASCII space, the bracket pairs `()[]{}<>`, the single
/// and double quote, the punctuation bytes `!@#$&*?;|`, the backslash, the
/// path-separator byte `/` (per clarification Q10 + FR-004), and DEL (`0x7f`).
pub const DEFAULT_UNSAFE_CHARS: &[u8] = b" ()[]{}<>'\"!@#$&*?;|\\/\x7f";
19
/// Default separator byte for [`Filter::Wipeup`]: the byte whose runs are
/// collapsed and, optionally, trimmed. Matches upstream's `_` default.
pub const DEFAULT_SEPARATOR: u8 = b'_';
23
/// A single transformation step of the [`Sequence`](crate::Sequence) pipeline.
///
/// The enum is `#[non_exhaustive]` (required by FR-039) so that SemVer-minor
/// releases can add new variants, such as a future
/// `--transliterate=deunicode` opt-in.
///
/// # Construction shortcut
///
/// To obtain [`Filter::Safe`] with the default unsafe-character set, call
/// [`Filter::safe_default()`] rather than spelling out the byte set by hand.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Filter {
    /// Decode CGI percent-escapes (`%XX` → single byte). FR-001.
    Uncgi,
    /// Translate Latin-1 high bytes (0x80–0xFF) to ASCII using the vendored
    /// `Table.iso8859_1`. FR-002.
    Iso8859_1,
    /// Translate UTF-8 codepoints to ASCII using the vendored `Table.utf_8`.
    /// Codepoints without a mapping pass through. FR-003.
    Utf8,
    /// Replace every byte found in `unsafe_chars` with `replacement`. FR-004.
    Safe {
        /// Byte written in place of each unsafe byte (default `b'_'`).
        replacement: u8,
        /// The set of bytes treated as unsafe. [`DEFAULT_UNSAFE_CHARS`] is the
        /// v0.1.0 default; callers MAY supply any byte set.
        unsafe_chars: Vec<u8>,
    },
    /// Collapse each run of `separator` into a single occurrence; when
    /// `remove_trailing` is true, leading and trailing runs are trimmed as
    /// well. FR-005.
    Wipeup {
        /// Separator byte (default [`DEFAULT_SEPARATOR`]).
        separator: u8,
        /// When true, trim leading and trailing runs of `separator`.
        remove_trailing: bool,
    },
    /// Truncate to `limit` bytes while preserving the final extension token
    /// (everything after the last `.`). FR-006.
    MaxLength {
        /// Maximum total byte length of the basename.
        limit: usize,
    },
    /// Rewrite Windows-reserved device names (CON, PRN, AUX, NUL, COM1–9,
    /// LPT1–9) by suffixing the basename with `_`, and rewrite Windows-
    /// reserved characters (`< > : " | ? *`) and ASCII control bytes using
    /// the same replacement as [`Filter::Safe`]. FR-007.
    ///
    /// Auto-enabled on Windows builds; opt-in elsewhere via a sequence entry.
    SafePlatform,
}
74
75impl Filter {
76    /// Construct a [`Filter::Safe`] with the v0.1.0 default unsafe-character
77    /// set and `b'_'` replacement (FR-004 + clarification Q10). Convenience
78    /// constructor for callers who want the upstream-compatible default
79    /// without enumerating the byte set.
80    ///
81    /// # Examples
82    ///
83    /// ```
84    /// use rusty_detox::Filter;
85    ///
86    /// let safe = Filter::safe_default();
87    /// assert!(matches!(safe, Filter::Safe { replacement: b'_', .. }));
88    /// ```
89    pub fn safe_default() -> Self {
90        Filter::Safe {
91            replacement: b'_',
92            unsafe_chars: DEFAULT_UNSAFE_CHARS.to_vec(),
93        }
94    }
95
96    /// Construct a [`Filter::Wipeup`] with `b'_'` separator and trailing
97    /// trimming enabled (matches upstream's `default` sequence).
98    pub fn wipeup_default() -> Self {
99        Filter::Wipeup {
100            separator: DEFAULT_SEPARATOR,
101            remove_trailing: true,
102        }
103    }
104
105    /// Apply this single filter to `input`, returning the transformed bytes.
106    pub fn apply(&self, input: &[u8]) -> Vec<u8> {
107        match self {
108            Filter::Uncgi => uncgi::apply(input),
109            Filter::Iso8859_1 => iso8859_1::apply(input),
110            Filter::Utf8 => utf8::apply(input),
111            Filter::Safe {
112                replacement,
113                unsafe_chars,
114            } => safe::apply(input, *replacement, unsafe_chars),
115            Filter::Wipeup {
116                separator,
117                remove_trailing,
118            } => wipeup::apply(input, *separator, *remove_trailing),
119            Filter::MaxLength { limit } => max_length::apply(input, *limit),
120            Filter::SafePlatform => safe_platform::apply(input),
121        }
122    }
123}
124
#[cfg(test)]
mod tests {
    use super::*;

    /// `safe_default` must yield `Filter::Safe` with the FR-004 defaults:
    /// `_` as replacement and the full `DEFAULT_UNSAFE_CHARS` set.
    #[test]
    fn safe_default_matches_fr004() {
        match Filter::safe_default() {
            Filter::Safe {
                replacement,
                unsafe_chars,
            } => {
                assert_eq!(replacement, b'_');
                assert!(unsafe_chars.contains(&b' '));
                assert!(unsafe_chars.contains(&b'/')); // Q10
                assert!(unsafe_chars.contains(&b'('));
                // The constructor copies the shared default set verbatim.
                assert_eq!(unsafe_chars.as_slice(), DEFAULT_UNSAFE_CHARS);
            }
            _ => panic!("safe_default must return Filter::Safe"),
        }
    }

    /// `wipeup_default` must use the default separator with trimming enabled.
    #[test]
    fn wipeup_default_uses_default_separator_and_trims() {
        assert_eq!(
            Filter::wipeup_default(),
            Filter::Wipeup {
                separator: DEFAULT_SEPARATOR,
                remove_trailing: true,
            }
        );
    }

    /// Interior runs of the separator collapse to a single byte.
    #[test]
    fn wipeup_default_collapses_underscores() {
        let f = Filter::wipeup_default();
        assert_eq!(f.apply(b"a__b___c"), b"a_b_c");
    }
}