// runex_core/sanitize.rs

/// Returns true if `c` is U+00AD SOFT HYPHEN.
fn is_soft_hyphen(c: char) -> bool {
    matches!(c, '\u{00AD}')
}
4
/// Returns true if `c` is U+034F COMBINING GRAPHEME JOINER.
fn is_combining_grapheme_joiner(c: char) -> bool {
    const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
    c == COMBINING_GRAPHEME_JOINER
}
8
/// Returns true if `c` is U+061C ARABIC LETTER MARK.
fn is_arabic_letter_mark(c: char) -> bool {
    matches!(c, '\u{061C}')
}
12
/// Returns true if `c` is one of the Hangul filler characters.
///
/// Matches U+115F, U+1160, U+3164 and U+FFA0.
fn is_hangul_filler(c: char) -> bool {
    const FILLERS: [char; 4] = ['\u{115F}', '\u{1160}', '\u{3164}', '\u{FFA0}'];
    FILLERS.contains(&c)
}
16
/// Returns true if `c` is U+17B4 or U+17B5 (the Khmer inherent vowels).
fn is_khmer_invisible_vowel(c: char) -> bool {
    c == '\u{17B4}' || c == '\u{17B5}'
}
20
/// Returns true if `c` is a Mongolian free variation selector.
///
/// Matches U+180B..=U+180D and U+180F; U+180E is not matched.
fn is_mongolian_free_variation_selector(c: char) -> bool {
    ('\u{180B}'..='\u{180D}').contains(&c) || c == '\u{180F}'
}
24
/// Returns true if `c` lies in U+200B..=U+200F.
///
/// That range holds the zero-width space, non-joiner and joiner, plus the
/// left-to-right and right-to-left marks.
fn is_zero_width(c: char) -> bool {
    ('\u{200B}'..='\u{200F}').contains(&c)
}
28
/// Returns true if `c` is a bidirectional embedding or override control.
///
/// Matches U+202A..=U+202E (LRE, RLE, PDF, LRO, RLO).
fn is_bidi_control(c: char) -> bool {
    ('\u{202A}'..='\u{202E}').contains(&c)
}
32
/// Returns true if `c` lies in U+2060..=U+206F.
///
/// The range starts at the word joiner and includes the invisible math
/// operators and the bidi isolate controls.
fn is_invisible_operator(c: char) -> bool {
    ('\u{2060}'..='\u{206F}').contains(&c)
}
36
/// Returns true if `c` is a Unicode variation selector.
///
/// Covers both variation-selector blocks:
/// - Variation Selectors, U+FE00..=U+FE0F (VS1–VS16)
/// - Variation Selectors Supplement, U+E0100..=U+E01EF (VS17–VS256)
///
/// The supplement is included because its code points are just as invisible
/// as VS1–VS16 and would otherwise pass through untouched.
fn is_variation_selector(c: char) -> bool {
    matches!(c, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
40
/// Returns true if `c` is U+FEFF, the byte order mark.
fn is_bom(c: char) -> bool {
    const BYTE_ORDER_MARK: char = '\u{FEFF}';
    c == BYTE_ORDER_MARK
}
44
/// Returns true if `c` is an interlinear annotation control.
///
/// Matches U+FFF9..=U+FFFB (annotation anchor, separator and terminator).
fn is_interlinear_annotation(c: char) -> bool {
    ('\u{FFF9}'..='\u{FFFB}').contains(&c)
}
48
/// Returns true if `c` lies in the Unicode Tags block, U+E0000..=U+E007F.
fn is_tag(c: char) -> bool {
    ('\u{E0000}'..='\u{E007F}').contains(&c)
}
52
/// Reports whether `c` is a non-ASCII Unicode line-breaking character.
///
/// Three code points match: NEL (U+0085), LINE SEPARATOR (U+2028) and
/// PARAGRAPH SEPARATOR (U+2029). Some runtimes treat them as newlines, and
/// they cannot be safely escaped inside shell string literals, so callers
/// drop them instead.
pub fn is_unicode_line_separator(c: char) -> bool {
    const SEPARATORS: [char; 3] = ['\u{0085}', '\u{2028}', '\u{2029}'];
    SEPARATORS.contains(&c)
}
61
62/// Returns true if `c` is a Unicode visual-deception character.
63///
64/// These characters are invisible or visually ambiguous in terminal output and
65/// can be used to mislead users about the content of a string (e.g., RLO for
66/// right-to-left override, BOM, zero-width spaces).
67pub fn is_deceptive_unicode(c: char) -> bool {
68    is_soft_hyphen(c)
69        || is_combining_grapheme_joiner(c)
70        || is_arabic_letter_mark(c)
71        || is_hangul_filler(c)
72        || is_khmer_invisible_vowel(c)
73        || is_mongolian_free_variation_selector(c)
74        || is_zero_width(c)
75        || is_bidi_control(c)
76        || is_invisible_operator(c)
77        || is_variation_selector(c)
78        || is_bom(c)
79        || is_interlinear_annotation(c)
80        || is_tag(c)
81}
82
83/// Returns true if `c` should be removed before printing to a terminal.
84///
85/// This is a superset of [`is_deceptive_unicode`]: it also covers ASCII control
86/// characters (which can move the cursor, clear the screen, etc.) and Unicode
87/// line/paragraph separators that behave like newlines.
88pub fn is_unsafe_for_display(c: char) -> bool {
89    c.is_ascii_control()
90        || is_unicode_line_separator(c)
91        || is_deceptive_unicode(c)
92}
93
94/// Strip characters unsafe for terminal display from a string.
95///
96/// Removes all characters for which [`is_unsafe_for_display`] returns `true`.
97pub fn sanitize_for_display(s: &str) -> String {
98    s.chars().filter(|&c| !is_unsafe_for_display(c)).collect()
99}
100
101/// Strip characters unsafe for terminal display, preserving newlines and tabs.
102///
103/// Like [`sanitize_for_display`] but allows `\n`, `\r`, and `\t` so that
104/// multi-line messages (e.g. TOML parse errors) remain readable.
105pub fn sanitize_multiline_for_display(s: &str) -> String {
106    s.chars()
107        .filter(|&c| c == '\n' || c == '\r' || c == '\t' || !is_unsafe_for_display(c))
108        .collect()
109}
110
/// Look up the escape sequence for `c` inside a double-quoted string literal.
///
/// Five characters need escaping: backslash, double quote, newline, carriage
/// return and tab. For those the two-character escape is returned as
/// `Some(..)`; every other character yields `None`. Shared by the Nu and Lua
/// quote functions so that they do not each repeat the same mapping.
pub fn double_quote_escape(c: char) -> Option<&'static str> {
    const ESCAPES: [(char, &'static str); 5] = [
        ('\\', "\\\\"),
        ('"', "\\\""),
        ('\n', "\\n"),
        ('\r', "\\r"),
        ('\t', "\\t"),
    ];
    ESCAPES
        .iter()
        .find(|entry| entry.0 == c)
        .map(|entry| entry.1)
}
127
128/// Returns true if `c` should be silently dropped when building a Nu double-quoted
129/// string (`"..."` or `^"..."`).
130///
131/// Nu string escaping handles `\n`, `\r`, and `\t` as explicit two-character
132/// sequences, so those three are excluded here.  Everything else that is unsafe
133/// for terminal display is dropped rather than escaped.
134pub fn is_nu_drop_char(c: char) -> bool {
135    !matches!(c, '\n' | '\r' | '\t') && is_unsafe_for_display(c)
136}
137
// Unit tests for the sanitising predicates and helpers in this module.
#[cfg(test)]
mod tests {
    use super::*;

    // ---- is_unicode_line_separator ----

    #[test]
    fn is_unicode_line_separator_detects_nel_ls_ps() {
        assert!(is_unicode_line_separator('\u{0085}'));
        assert!(is_unicode_line_separator('\u{2028}'));
        assert!(is_unicode_line_separator('\u{2029}'));
    }

    #[test]
    fn is_unicode_line_separator_ignores_ascii_newlines() {
        assert!(!is_unicode_line_separator('\n'));
        assert!(!is_unicode_line_separator('\r'));
        assert!(!is_unicode_line_separator('a'));
    }

    // ---- is_deceptive_unicode ----

    #[test]
    fn is_deceptive_unicode_detects_rlo() {
        assert!(is_deceptive_unicode('\u{202E}'));
    }

    #[test]
    fn is_deceptive_unicode_detects_bom() {
        assert!(is_deceptive_unicode('\u{FEFF}'));
    }

    #[test]
    fn is_deceptive_unicode_detects_zwsp() {
        assert!(is_deceptive_unicode('\u{200B}'));
    }

    #[test]
    fn is_deceptive_unicode_allows_normal_chars() {
        assert!(!is_deceptive_unicode('a'));
        assert!(!is_deceptive_unicode(' '));
        assert!(!is_deceptive_unicode('é'));
    }

    // ---- is_unsafe_for_display ----

    #[test]
    fn is_unsafe_for_display_detects_esc() {
        assert!(is_unsafe_for_display('\x1B'));
    }

    #[test]
    fn is_unsafe_for_display_detects_bel() {
        assert!(is_unsafe_for_display('\x07'));
    }

    #[test]
    fn is_unsafe_for_display_detects_del() {
        assert!(is_unsafe_for_display('\x7F'));
    }

    #[test]
    fn is_unsafe_for_display_detects_rlo() {
        assert!(is_unsafe_for_display('\u{202E}'));
    }

    #[test]
    fn is_unsafe_for_display_detects_bom() {
        assert!(is_unsafe_for_display('\u{FEFF}'));
    }

    #[test]
    fn is_unsafe_for_display_detects_zwsp() {
        assert!(is_unsafe_for_display('\u{200B}'));
    }

    #[test]
    fn is_unsafe_for_display_allows_normal_chars() {
        assert!(!is_unsafe_for_display('a'));
        assert!(!is_unsafe_for_display(' '));
        assert!(!is_unsafe_for_display('é'));
    }

    // ---- sanitize_for_display ----

    #[test]
    fn sanitize_for_display_strips_control_chars() {
        assert_eq!(sanitize_for_display("he\x1Bllo"), "hello");
    }

    #[test]
    fn sanitize_for_display_strips_rlo() {
        assert_eq!(sanitize_for_display("he\u{202E}llo"), "hello");
    }

    #[test]
    fn sanitize_for_display_strips_newlines_and_tabs() {
        assert_eq!(sanitize_for_display("a\nb\tc\rd"), "abcd");
    }

    #[test]
    fn sanitize_for_display_preserves_normal_text() {
        assert_eq!(sanitize_for_display("hello world"), "hello world");
    }

    #[test]
    fn sanitize_for_display_handles_empty_input() {
        assert_eq!(sanitize_for_display(""), "");
    }

    // ---- sanitize_multiline_for_display ----

    #[test]
    fn sanitize_multiline_for_display_preserves_layout_chars() {
        assert_eq!(
            sanitize_multiline_for_display("line1\r\n\tline2"),
            "line1\r\n\tline2"
        );
    }

    #[test]
    fn sanitize_multiline_for_display_strips_other_unsafe_chars() {
        assert_eq!(
            sanitize_multiline_for_display("a\x1Bb\nc\u{202E}d\u{2028}e"),
            "ab\ncde"
        );
    }

    #[test]
    fn sanitize_multiline_for_display_handles_empty_input() {
        assert_eq!(sanitize_multiline_for_display(""), "");
    }

    // ---- is_nu_drop_char ----

    #[test]
    fn is_nu_drop_char_drops_all_ascii_control() {
        for b in 0u8..=0x1F {
            let c = b as char;
            if matches!(c, '\n' | '\r' | '\t') {
                assert!(!is_nu_drop_char(c), "U+{:04X} must not be dropped (it is escaped)", b);
            } else {
                assert!(is_nu_drop_char(c), "U+{:04X} must be dropped", b);
            }
        }
        assert!(is_nu_drop_char('\x7F'), "DEL must be dropped");
    }

    #[test]
    fn is_nu_drop_char_drops_nel() {
        assert!(is_nu_drop_char('\u{0085}'));
    }

    #[test]
    fn is_nu_drop_char_drops_line_separator() {
        assert!(is_nu_drop_char('\u{2028}'));
    }

    #[test]
    fn is_nu_drop_char_drops_paragraph_separator() {
        assert!(is_nu_drop_char('\u{2029}'));
    }

    #[test]
    fn is_nu_drop_char_drops_rlo() {
        assert!(is_nu_drop_char('\u{202E}'));
    }

    #[test]
    fn is_nu_drop_char_drops_bom() {
        assert!(is_nu_drop_char('\u{FEFF}'));
    }

    #[test]
    fn is_nu_drop_char_drops_zwsp() {
        assert!(is_nu_drop_char('\u{200B}'));
    }

    #[test]
    fn is_nu_drop_char_preserves_newline() {
        assert!(!is_nu_drop_char('\n'));
    }

    #[test]
    fn is_nu_drop_char_preserves_carriage_return() {
        assert!(!is_nu_drop_char('\r'));
    }

    #[test]
    fn is_nu_drop_char_preserves_tab() {
        assert!(!is_nu_drop_char('\t'));
    }

    #[test]
    fn is_nu_drop_char_keeps_normal_chars() {
        assert!(!is_nu_drop_char('a'));
        assert!(!is_nu_drop_char(' '));
        assert!(!is_nu_drop_char('"'));
    }

    // ---- double_quote_escape ----
    // Each test pins the exact escape text, not merely that an escape exists,
    // so a wrong replacement string is caught.

    #[test]
    fn double_quote_escape_escapes_backslash() {
        assert_eq!(double_quote_escape('\\'), Some("\\\\"));
    }

    #[test]
    fn double_quote_escape_escapes_double_quote() {
        assert_eq!(double_quote_escape('"'), Some("\\\""));
    }

    #[test]
    fn double_quote_escape_escapes_newline() {
        assert_eq!(double_quote_escape('\n'), Some("\\n"));
    }

    #[test]
    fn double_quote_escape_escapes_carriage_return() {
        assert_eq!(double_quote_escape('\r'), Some("\\r"));
    }

    #[test]
    fn double_quote_escape_escapes_tab() {
        assert_eq!(double_quote_escape('\t'), Some("\\t"));
    }

    #[test]
    fn double_quote_escape_ignores_letter() {
        assert_eq!(double_quote_escape('a'), None);
    }

    #[test]
    fn double_quote_escape_ignores_dollar() {
        assert_eq!(double_quote_escape('$'), None);
    }

    #[test]
    fn double_quote_escape_ignores_nul() {
        assert_eq!(double_quote_escape('\0'), None);
    }
}