/// Returns `true` when `c` is U+00AD (SOFT HYPHEN).
fn is_soft_hyphen(c: char) -> bool {
    matches!(c, '\u{00AD}')
}
4
/// Returns `true` when `c` is U+034F (COMBINING GRAPHEME JOINER).
fn is_combining_grapheme_joiner(c: char) -> bool {
    matches!(c, '\u{034F}')
}
8
/// Returns `true` when `c` is U+061C (ARABIC LETTER MARK).
fn is_arabic_letter_mark(c: char) -> bool {
    matches!(c, '\u{061C}')
}
12
/// Returns `true` when `c` is one of the Hangul filler code points:
/// U+115F, U+1160, U+3164 or U+FFA0.
fn is_hangul_filler(c: char) -> bool {
    let is_jamo_filler = c == '\u{115F}' || c == '\u{1160}';
    let is_compat_filler = c == '\u{3164}';
    let is_halfwidth_filler = c == '\u{FFA0}';
    is_jamo_filler || is_compat_filler || is_halfwidth_filler
}
16
/// Returns `true` when `c` is U+17B4 or U+17B5 (the Khmer inherent vowels).
fn is_khmer_invisible_vowel(c: char) -> bool {
    c == '\u{17B4}' || c == '\u{17B5}'
}
20
/// Returns `true` when `c` is a Mongolian free variation selector:
/// U+180B..=U+180D or U+180F.
///
/// U+180E, which sits between those code points, is not matched.
fn is_mongolian_free_variation_selector(c: char) -> bool {
    ('\u{180B}'..='\u{180D}').contains(&c) || c == '\u{180F}'
}
24
/// Returns `true` when `c` lies in U+200B..=U+200F: the zero-width space,
/// non-joiner and joiner, plus the left-to-right and right-to-left marks.
fn is_zero_width(c: char) -> bool {
    ('\u{200B}'..='\u{200F}').contains(&c)
}
28
/// Returns `true` when `c` lies in U+202A..=U+202E, the bidirectional
/// embedding and override controls (LRE, RLE, PDF, LRO, RLO).
fn is_bidi_control(c: char) -> bool {
    let code_point = u32::from(c);
    (0x202A..=0x202E).contains(&code_point)
}
32
/// Returns `true` when `c` lies in U+2060..=U+206F.
///
/// Besides the invisible math operators, this range also contains the word
/// joiner and the bidirectional isolate controls (U+2066..=U+2069).
fn is_invisible_operator(c: char) -> bool {
    ('\u{2060}'..='\u{206F}').contains(&c)
}
36
/// Returns `true` when `c` is a variation selector.
///
/// Two ranges are matched:
/// - U+FE00..=U+FE0F (VS1..VS16), and
/// - U+E0100..=U+E01EF (Variation Selectors Supplement, VS17..VS256).
///
/// The supplementary range is just as invisible as the basic one, so it is
/// matched here too rather than being left for callers to handle.
fn is_variation_selector(c: char) -> bool {
    matches!(c, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
40
/// Returns `true` when `c` is U+FEFF (byte order mark / zero-width no-break
/// space).
fn is_bom(c: char) -> bool {
    matches!(c, '\u{FEFF}')
}
44
/// Returns `true` when `c` lies in U+FFF9..=U+FFFB, the interlinear
/// annotation anchor, separator and terminator.
fn is_interlinear_annotation(c: char) -> bool {
    ('\u{FFF9}'..='\u{FFFB}').contains(&c)
}
48
/// Returns `true` when `c` lies in U+E0000..=U+E007F (the Tags block).
fn is_tag(c: char) -> bool {
    let code_point = u32::from(c);
    (0xE0000..=0xE007F).contains(&code_point)
}
52
/// Returns `true` when `c` is one of the non-ASCII line-breaking characters
/// U+0085 (NEXT LINE), U+2028 (LINE SEPARATOR) or U+2029 (PARAGRAPH
/// SEPARATOR).
///
/// ASCII line breaks such as `'\n'` and `'\r'` are not matched.
pub fn is_unicode_line_separator(c: char) -> bool {
    c == '\u{0085}' || c == '\u{2028}' || c == '\u{2029}'
}
61
62pub fn is_deceptive_unicode(c: char) -> bool {
68 is_soft_hyphen(c)
69 || is_combining_grapheme_joiner(c)
70 || is_arabic_letter_mark(c)
71 || is_hangul_filler(c)
72 || is_khmer_invisible_vowel(c)
73 || is_mongolian_free_variation_selector(c)
74 || is_zero_width(c)
75 || is_bidi_control(c)
76 || is_invisible_operator(c)
77 || is_variation_selector(c)
78 || is_bom(c)
79 || is_interlinear_annotation(c)
80 || is_tag(c)
81}
82
83pub fn is_unsafe_for_display(c: char) -> bool {
89 c.is_ascii_control()
90 || is_unicode_line_separator(c)
91 || is_deceptive_unicode(c)
92}
93
94pub fn sanitize_for_display(s: &str) -> String {
98 s.chars().filter(|&c| !is_unsafe_for_display(c)).collect()
99}
100
101pub fn sanitize_multiline_for_display(s: &str) -> String {
106 s.chars()
107 .filter(|&c| c == '\n' || c == '\r' || c == '\t' || !is_unsafe_for_display(c))
108 .collect()
109}
110
/// Returns the backslash escape sequence for `c`, or `None` when `c` has no
/// escape in this table.
///
/// Only five characters are escaped: backslash, double quote, newline,
/// carriage return and tab.
pub fn double_quote_escape(c: char) -> Option<&'static str> {
    let escaped = match c {
        '\\' => "\\\\",
        '"' => "\\\"",
        '\n' => "\\n",
        '\r' => "\\r",
        '\t' => "\\t",
        _ => return None,
    };
    Some(escaped)
}
127
128pub fn is_nu_drop_char(c: char) -> bool {
135 !matches!(c, '\n' | '\r' | '\t') && is_unsafe_for_display(c)
136}
137
#[cfg(test)]
mod tests {
    use super::*;

    // --- is_unicode_line_separator -------------------------------------

    #[test]
    fn is_unicode_line_separator_detects_nel_ls_ps() {
        assert!(is_unicode_line_separator('\u{0085}'));
        assert!(is_unicode_line_separator('\u{2028}'));
        assert!(is_unicode_line_separator('\u{2029}'));
    }

    #[test]
    fn is_unicode_line_separator_ignores_ascii_line_breaks() {
        assert!(!is_unicode_line_separator('\n'));
        assert!(!is_unicode_line_separator('\r'));
    }

    // --- is_deceptive_unicode ------------------------------------------

    #[test]
    fn is_deceptive_unicode_detects_rlo() {
        assert!(is_deceptive_unicode('\u{202E}'));
    }

    #[test]
    fn is_deceptive_unicode_detects_bom() {
        assert!(is_deceptive_unicode('\u{FEFF}'));
    }

    #[test]
    fn is_deceptive_unicode_detects_zwsp() {
        assert!(is_deceptive_unicode('\u{200B}'));
    }

    #[test]
    fn is_deceptive_unicode_allows_normal_chars() {
        assert!(!is_deceptive_unicode('a'));
        assert!(!is_deceptive_unicode(' '));
        assert!(!is_deceptive_unicode('é'));
    }

    // --- is_unsafe_for_display -----------------------------------------

    #[test]
    fn is_unsafe_for_display_detects_esc() {
        assert!(is_unsafe_for_display('\x1B'));
    }

    #[test]
    fn is_unsafe_for_display_detects_bel() {
        assert!(is_unsafe_for_display('\x07'));
    }

    #[test]
    fn is_unsafe_for_display_detects_del() {
        assert!(is_unsafe_for_display('\x7F'));
    }

    #[test]
    fn is_unsafe_for_display_detects_rlo() {
        assert!(is_unsafe_for_display('\u{202E}'));
    }

    #[test]
    fn is_unsafe_for_display_detects_bom() {
        assert!(is_unsafe_for_display('\u{FEFF}'));
    }

    #[test]
    fn is_unsafe_for_display_detects_zwsp() {
        assert!(is_unsafe_for_display('\u{200B}'));
    }

    #[test]
    fn is_unsafe_for_display_allows_normal_chars() {
        assert!(!is_unsafe_for_display('a'));
        assert!(!is_unsafe_for_display(' '));
        assert!(!is_unsafe_for_display('é'));
    }

    // --- sanitize_for_display ------------------------------------------

    #[test]
    fn sanitize_for_display_strips_control_chars() {
        assert_eq!(sanitize_for_display("he\x1Bllo"), "hello");
    }

    #[test]
    fn sanitize_for_display_strips_rlo() {
        assert_eq!(sanitize_for_display("he\u{202E}llo"), "hello");
    }

    #[test]
    fn sanitize_for_display_strips_line_breaks_and_tabs() {
        assert_eq!(sanitize_for_display("a\nb\rc\td"), "abcd");
    }

    #[test]
    fn sanitize_for_display_preserves_normal_text() {
        assert_eq!(sanitize_for_display("hello world"), "hello world");
    }

    #[test]
    fn sanitize_for_display_handles_empty_input() {
        assert_eq!(sanitize_for_display(""), "");
    }

    // --- sanitize_multiline_for_display --------------------------------

    #[test]
    fn sanitize_multiline_for_display_preserves_line_structure() {
        assert_eq!(sanitize_multiline_for_display("a\tb\r\nc"), "a\tb\r\nc");
    }

    #[test]
    fn sanitize_multiline_for_display_strips_unsafe_chars() {
        assert_eq!(
            sanitize_multiline_for_display("he\x1Bl\u{202E}lo\n"),
            "hello\n"
        );
    }

    #[test]
    fn sanitize_multiline_for_display_strips_unicode_line_separators() {
        assert_eq!(
            sanitize_multiline_for_display("a\u{2028}b\u{2029}c\u{0085}d"),
            "abcd"
        );
    }

    // --- is_nu_drop_char -----------------------------------------------

    #[test]
    fn is_nu_drop_char_drops_all_ascii_control() {
        for b in 0u8..=0x1F {
            let c = char::from(b);
            if matches!(c, '\n' | '\r' | '\t') {
                assert!(!is_nu_drop_char(c), "U+{:04X} must not be dropped (it is escaped)", b);
            } else {
                assert!(is_nu_drop_char(c), "U+{:04X} must be dropped", b);
            }
        }
        assert!(is_nu_drop_char('\x7F'), "DEL must be dropped");
    }

    #[test]
    fn is_nu_drop_char_drops_nel() {
        assert!(is_nu_drop_char('\u{0085}'));
    }

    #[test]
    fn is_nu_drop_char_drops_line_separator() {
        assert!(is_nu_drop_char('\u{2028}'));
    }

    #[test]
    fn is_nu_drop_char_drops_paragraph_separator() {
        assert!(is_nu_drop_char('\u{2029}'));
    }

    #[test]
    fn is_nu_drop_char_drops_rlo() {
        assert!(is_nu_drop_char('\u{202E}'));
    }

    #[test]
    fn is_nu_drop_char_drops_bom() {
        assert!(is_nu_drop_char('\u{FEFF}'));
    }

    #[test]
    fn is_nu_drop_char_drops_zwsp() {
        assert!(is_nu_drop_char('\u{200B}'));
    }

    #[test]
    fn is_nu_drop_char_preserves_newline() {
        assert!(!is_nu_drop_char('\n'));
    }

    #[test]
    fn is_nu_drop_char_preserves_carriage_return() {
        assert!(!is_nu_drop_char('\r'));
    }

    #[test]
    fn is_nu_drop_char_preserves_tab() {
        assert!(!is_nu_drop_char('\t'));
    }

    #[test]
    fn is_nu_drop_char_preserves_normal_chars() {
        assert!(!is_nu_drop_char('a'));
        assert!(!is_nu_drop_char(' '));
        assert!(!is_nu_drop_char('é'));
    }

    // --- double_quote_escape -------------------------------------------
    // These pin the exact escape text, not just that an escape exists, so a
    // wrong sequence cannot slip through.

    #[test]
    fn double_quote_escape_escapes_backslash() {
        assert_eq!(double_quote_escape('\\'), Some("\\\\"));
    }

    #[test]
    fn double_quote_escape_escapes_double_quote() {
        assert_eq!(double_quote_escape('"'), Some("\\\""));
    }

    #[test]
    fn double_quote_escape_escapes_newline() {
        assert_eq!(double_quote_escape('\n'), Some("\\n"));
    }

    #[test]
    fn double_quote_escape_escapes_carriage_return() {
        assert_eq!(double_quote_escape('\r'), Some("\\r"));
    }

    #[test]
    fn double_quote_escape_escapes_tab() {
        assert_eq!(double_quote_escape('\t'), Some("\\t"));
    }

    #[test]
    fn double_quote_escape_ignores_letter() {
        assert_eq!(double_quote_escape('a'), None);
    }

    #[test]
    fn double_quote_escape_ignores_dollar() {
        assert_eq!(double_quote_escape('$'), None);
    }

    #[test]
    fn double_quote_escape_ignores_nul() {
        assert_eq!(double_quote_escape('\0'), None);
    }
}