rusty_detox/filter/mod.rs
1//! The Filter pipeline (FR-001..FR-007 + FR-039).
2//!
3//! Each `Filter` variant is a single-step byte-sequence transformation. A
4//! [`Sequence`](crate::Sequence) is an ordered `Vec<Filter>` consumed
5//! left-to-right when [`Detox::sanitize`](crate::Detox::sanitize) runs.
6
7pub mod iso8859_1;
8pub mod max_length;
9pub mod safe;
10pub mod safe_platform;
11pub mod uncgi;
12pub mod utf8;
13pub mod wipeup;
14
/// Byte set that [`Filter::Safe`] treats as unsafe when the caller asks for
/// the upstream-compatible default. The path separator `/` is part of the
/// set, per clarification Q10 + FR-004.
pub const DEFAULT_UNSAFE_CHARS: &[u8] = &[
    // Space.
    b' ',
    // Brackets of every flavour.
    b'(', b')', b'[', b']', b'{', b'}', b'<', b'>',
    // Quotes.
    b'\'', b'"',
    // Remaining ASCII punctuation in the default set.
    b'!', b'@', b'#', b'$', b'&', b'*', b'?', b';', b'|',
    // Backslash and the path separator `/`.
    b'\\', b'/',
    // DEL (0x7f).
    0x7f,
];

/// Separator byte [`Filter::Wipeup`] uses by default: runs of this byte are
/// collapsed, and leading/trailing runs are trimmed when trimming is enabled.
/// Matches upstream's `_` default.
pub const DEFAULT_SEPARATOR: u8 = b'_';
23
/// One transformation step in the [`Sequence`](crate::Sequence) pipeline.
///
/// `#[non_exhaustive]` is required (FR-039) so SemVer-minor releases can add
/// new variants such as a future `--transliterate=deunicode` opt-in.
///
/// The type derives `Hash` alongside `Eq`, so filters can be stored in
/// `HashSet`s or used as `HashMap` keys (for example to de-duplicate steps).
///
/// # Construction shortcut
///
/// For [`Filter::Safe`] with the default unsafe-character set, prefer
/// [`Filter::safe_default()`] over enumerating the byte set manually.
#[non_exhaustive]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Filter {
    /// Decode CGI percent-escapes (`%XX` → single byte). FR-001.
    Uncgi,
    /// Translate Latin-1 high bytes (0x80–0xFF) to ASCII via the vendored
    /// `Table.iso8859_1`. FR-002.
    Iso8859_1,
    /// Translate UTF-8 codepoints to ASCII via the vendored `Table.utf_8`.
    /// Unmapped codepoints pass through. FR-003.
    Utf8,
    /// Replace each unsafe-set byte with `replacement`. FR-004.
    Safe {
        /// Replacement byte (default `b'_'`).
        replacement: u8,
        /// Bytes considered unsafe. See [`DEFAULT_UNSAFE_CHARS`] for the
        /// v0.1.0 default; callers MAY pass any byte set.
        unsafe_chars: Vec<u8>,
    },
    /// Collapse runs of `separator` into one occurrence; when
    /// `remove_trailing` is true, also trim leading/trailing runs. FR-005.
    Wipeup {
        /// Separator byte (default [`DEFAULT_SEPARATOR`]).
        separator: u8,
        /// When true, trim leading and trailing runs of `separator`.
        remove_trailing: bool,
    },
    /// Truncate to `limit` bytes while preserving the final extension token
    /// (everything after the last `.`). FR-006.
    MaxLength {
        /// Maximum total byte length of the basename.
        limit: usize,
    },
    /// Rewrite Windows-reserved device names (CON, PRN, AUX, NUL, COM1–9,
    /// LPT1–9) by suffixing the basename with `_`, and rewrite Windows-
    /// reserved characters (`< > : " | ? *`) and ASCII control bytes using
    /// the same replacement as [`Filter::Safe`]. FR-007.
    ///
    /// Auto-enabled on Windows builds; opt-in elsewhere via a sequence entry.
    SafePlatform,
}
74
75impl Filter {
76 /// Construct a [`Filter::Safe`] with the v0.1.0 default unsafe-character
77 /// set and `b'_'` replacement (FR-004 + clarification Q10). Convenience
78 /// constructor for callers who want the upstream-compatible default
79 /// without enumerating the byte set.
80 ///
81 /// # Examples
82 ///
83 /// ```
84 /// use rusty_detox::Filter;
85 ///
86 /// let safe = Filter::safe_default();
87 /// assert!(matches!(safe, Filter::Safe { replacement: b'_', .. }));
88 /// ```
89 pub fn safe_default() -> Self {
90 Filter::Safe {
91 replacement: b'_',
92 unsafe_chars: DEFAULT_UNSAFE_CHARS.to_vec(),
93 }
94 }
95
96 /// Construct a [`Filter::Wipeup`] with `b'_'` separator and trailing
97 /// trimming enabled (matches upstream's `default` sequence).
98 pub fn wipeup_default() -> Self {
99 Filter::Wipeup {
100 separator: DEFAULT_SEPARATOR,
101 remove_trailing: true,
102 }
103 }
104
105 /// Apply this single filter to `input`, returning the transformed bytes.
106 pub fn apply(&self, input: &[u8]) -> Vec<u8> {
107 match self {
108 Filter::Uncgi => uncgi::apply(input),
109 Filter::Iso8859_1 => iso8859_1::apply(input),
110 Filter::Utf8 => utf8::apply(input),
111 Filter::Safe {
112 replacement,
113 unsafe_chars,
114 } => safe::apply(input, *replacement, unsafe_chars),
115 Filter::Wipeup {
116 separator,
117 remove_trailing,
118 } => wipeup::apply(input, *separator, *remove_trailing),
119 Filter::MaxLength { limit } => max_length::apply(input, *limit),
120 Filter::SafePlatform => safe_platform::apply(input),
121 }
122 }
123}
124
#[cfg(test)]
mod tests {
    use super::Filter;

    /// `safe_default` must produce a `Safe` filter whose replacement is `_`
    /// and whose unsafe set covers space, `/` (Q10) and `(`.
    #[test]
    fn safe_default_matches_fr004() {
        if let Filter::Safe {
            replacement,
            unsafe_chars,
        } = Filter::safe_default()
        {
            assert_eq!(replacement, b'_');
            // Probe bytes, in order: space, `/` (Q10), `(`.
            for probe in [b' ', b'/', b'('].iter() {
                assert!(unsafe_chars.contains(probe));
            }
        } else {
            panic!("safe_default must return Filter::Safe");
        }
    }

    /// The default wipeup filter squeezes every run of `_` down to one byte.
    #[test]
    fn wipeup_default_collapses_underscores() {
        let collapsed = Filter::wipeup_default().apply(b"a__b___c");
        assert_eq!(collapsed, b"a_b_c");
    }
}
150}