/// Returns `true` when `c` is U+00AD SOFT HYPHEN.
fn is_soft_hyphen(c: char) -> bool {
    const SOFT_HYPHEN: char = '\u{00AD}';
    matches!(c, SOFT_HYPHEN)
}
/// Returns `true` when `c` is U+034F COMBINING GRAPHEME JOINER.
fn is_combining_grapheme_joiner(c: char) -> bool {
    const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
    matches!(c, COMBINING_GRAPHEME_JOINER)
}
/// Returns `true` when `c` is U+061C ARABIC LETTER MARK.
fn is_arabic_letter_mark(c: char) -> bool {
    const ARABIC_LETTER_MARK: char = '\u{061C}';
    matches!(c, ARABIC_LETTER_MARK)
}
/// Returns `true` when `c` is one of the Hangul filler code points.
///
/// Matched code points:
/// - U+115F HANGUL CHOSEONG FILLER
/// - U+1160 HANGUL JUNGSEONG FILLER
/// - U+3164 HANGUL FILLER
/// - U+FFA0 HALFWIDTH HANGUL FILLER
fn is_hangul_filler(c: char) -> bool {
    let is_jamo_filler = ('\u{115F}'..='\u{1160}').contains(&c);
    is_jamo_filler || c == '\u{3164}' || c == '\u{FFA0}'
}
/// Returns `true` when `c` is U+17B4 (KHMER VOWEL INHERENT AQ) or
/// U+17B5 (KHMER VOWEL INHERENT AA).
fn is_khmer_invisible_vowel(c: char) -> bool {
    c == '\u{17B4}' || c == '\u{17B5}'
}
/// Returns `true` when `c` is a Mongolian free variation selector:
/// U+180B..=U+180D (FVS1–FVS3) or U+180F (FVS4).
///
/// NOTE(review): U+180E MONGOLIAN VOWEL SEPARATOR lies between the two matched
/// spans and is *not* matched here; confirm that leaving it out is intentional.
fn is_mongolian_free_variation_selector(c: char) -> bool {
    ('\u{180B}'..='\u{180D}').contains(&c) || c == '\u{180F}'
}
/// Returns `true` when `c` lies in U+200B..=U+200F.
///
/// That span holds ZERO WIDTH SPACE, ZERO WIDTH NON-JOINER, ZERO WIDTH JOINER,
/// LEFT-TO-RIGHT MARK and RIGHT-TO-LEFT MARK.
fn is_zero_width(c: char) -> bool {
    ('\u{200B}'..='\u{200F}').contains(&c)
}
/// Returns `true` when `c` lies in U+202A..=U+202E, the bidirectional
/// embedding and override controls (LRE, RLE, PDF, LRO, RLO).
fn is_bidi_control(c: char) -> bool {
    ('\u{202A}'..='\u{202E}').contains(&c)
}
/// Returns `true` when `c` lies in U+2060..=U+206F.
///
/// Despite the name, the span is wider than the invisible math operators
/// (U+2061..=U+2064): it also takes in WORD JOINER (U+2060), the bidi isolate
/// controls (U+2066..=U+2069) and the deprecated format characters
/// (U+206A..=U+206F). Every code point in the range is matched.
fn is_invisible_operator(c: char) -> bool {
    ('\u{2060}'..='\u{206F}').contains(&c)
}
/// Returns `true` when `c` is a Unicode variation selector.
///
/// Two blocks are matched:
/// - U+FE00..=U+FE0F — Variation Selectors (VS1–VS16)
/// - U+E0100..=U+E01EF — Variation Selectors Supplement (VS17–VS256)
///
/// The supplementary block has no glyph of its own either, so it must be
/// matched as well; otherwise those selectors pass through the sanitiser
/// untouched.
fn is_variation_selector(c: char) -> bool {
    matches!(c, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
/// Returns `true` when `c` is U+FEFF (byte order mark / ZERO WIDTH NO-BREAK SPACE).
fn is_bom(c: char) -> bool {
    const BYTE_ORDER_MARK: char = '\u{FEFF}';
    matches!(c, BYTE_ORDER_MARK)
}
/// Returns `true` when `c` lies in U+FFF9..=U+FFFB, the interlinear annotation
/// anchor, separator and terminator.
fn is_interlinear_annotation(c: char) -> bool {
    ('\u{FFF9}'..='\u{FFFB}').contains(&c)
}
/// Returns `true` when `c` lies in U+E0000..=U+E007F (the Tags block).
fn is_tag(c: char) -> bool {
    ('\u{E0000}'..='\u{E007F}').contains(&c)
}
/// Returns `true` when `c` is one of the non-ASCII line-breaking characters:
/// U+0085 NEXT LINE, U+2028 LINE SEPARATOR or U+2029 PARAGRAPH SEPARATOR.
///
/// ASCII line breaks (`'\n'`, `'\r'`) are not matched.
pub fn is_unicode_line_separator(c: char) -> bool {
    c == '\u{0085}' || c == '\u{2028}' || c == '\u{2029}'
}
pub fn is_deceptive_unicode(c: char) -> bool {
is_soft_hyphen(c)
|| is_combining_grapheme_joiner(c)
|| is_arabic_letter_mark(c)
|| is_hangul_filler(c)
|| is_khmer_invisible_vowel(c)
|| is_mongolian_free_variation_selector(c)
|| is_zero_width(c)
|| is_bidi_control(c)
|| is_invisible_operator(c)
|| is_variation_selector(c)
|| is_bom(c)
|| is_interlinear_annotation(c)
|| is_tag(c)
}
/// Returns `true` when `c` should not be written verbatim to a display.
///
/// A character is flagged when it is any of:
/// - a control character (Unicode category `Cc`): C0 controls
///   U+0000..=U+001F, DEL U+007F, and the C1 controls U+0080..=U+009F;
/// - a Unicode line separator (see [`is_unicode_line_separator`]);
/// - an invisible / formatting character (see [`is_deceptive_unicode`]).
///
/// Note that `'\n'`, `'\r'` and `'\t'` are control characters and are
/// therefore flagged as well; callers that want to keep them must exempt them
/// explicitly.
pub fn is_unsafe_for_display(c: char) -> bool {
    // `char::is_control` rather than `is_ascii_control`: the C1 range contains
    // U+009B (CSI), the single-character form of `ESC [`, which some terminals
    // act on just like the two-character sequence.
    c.is_control() || is_unicode_line_separator(c) || is_deceptive_unicode(c)
}
/// Returns a copy of `s` with every character flagged by
/// [`is_unsafe_for_display`] removed.
///
/// Line breaks and tabs are control characters and are removed too; the
/// remaining characters keep their original order.
pub fn sanitize_for_display(s: &str) -> String {
    let mut cleaned = String::with_capacity(s.len());
    for ch in s.chars() {
        if !is_unsafe_for_display(ch) {
            cleaned.push(ch);
        }
    }
    cleaned
}
/// Returns a copy of `s` with every character flagged by
/// [`is_unsafe_for_display`] removed, except that `'\n'`, `'\r'` and `'\t'`
/// are always kept so multi-line layout survives.
pub fn sanitize_multiline_for_display(s: &str) -> String {
    let mut kept = String::new();
    for ch in s.chars() {
        // Layout whitespace is exempt from the safety check.
        let is_layout = matches!(ch, '\n' | '\r' | '\t');
        if is_layout || !is_unsafe_for_display(ch) {
            kept.push(ch);
        }
    }
    kept
}
/// Returns the backslash escape sequence for `c`, or `None` when `c` has no
/// escape defined here.
///
/// Escaped characters: backslash, double quote, line feed, carriage return
/// and tab. Every other character (including NUL and `$`) yields `None`.
pub fn double_quote_escape(c: char) -> Option<&'static str> {
    // (raw character, escaped spelling)
    const ESCAPES: [(char, &str); 5] = [
        ('\\', "\\\\"),
        ('"', "\\\""),
        ('\n', "\\n"),
        ('\r', "\\r"),
        ('\t', "\\t"),
    ];
    ESCAPES
        .iter()
        .find(|&&(raw, _)| raw == c)
        .map(|&(_, escaped)| escaped)
}
/// Returns `true` when `c` should be dropped rather than emitted.
///
/// `'\n'`, `'\r'` and `'\t'` are never dropped (they have escape sequences in
/// [`double_quote_escape`]); any other character is dropped exactly when
/// [`is_unsafe_for_display`] flags it.
pub fn is_nu_drop_char(c: char) -> bool {
    match c {
        '\n' | '\r' | '\t' => false,
        other => is_unsafe_for_display(other),
    }
}
#[cfg(test)]
mod tests {
    // Unit tests for the character predicates and sanitising helpers above.
    use super::*;

    // ---- is_unicode_line_separator ------------------------------------

    #[test]
    fn is_unicode_line_separator_detects_nel_ls_ps() {
        assert!(is_unicode_line_separator('\u{0085}'));
        assert!(is_unicode_line_separator('\u{2028}'));
        assert!(is_unicode_line_separator('\u{2029}'));
    }

    #[test]
    fn is_unicode_line_separator_ignores_ascii_line_breaks() {
        assert!(!is_unicode_line_separator('\n'));
        assert!(!is_unicode_line_separator('\r'));
    }

    // ---- is_deceptive_unicode -----------------------------------------

    #[test]
    fn is_deceptive_unicode_detects_rlo() {
        assert!(is_deceptive_unicode('\u{202E}'));
    }

    #[test]
    fn is_deceptive_unicode_detects_bom() {
        assert!(is_deceptive_unicode('\u{FEFF}'));
    }

    #[test]
    fn is_deceptive_unicode_detects_zwsp() {
        assert!(is_deceptive_unicode('\u{200B}'));
    }

    #[test]
    fn is_deceptive_unicode_allows_normal_chars() {
        assert!(!is_deceptive_unicode('a'));
        assert!(!is_deceptive_unicode(' '));
        assert!(!is_deceptive_unicode('é'));
    }

    // ---- is_unsafe_for_display ----------------------------------------

    #[test]
    fn is_unsafe_for_display_detects_esc() {
        assert!(is_unsafe_for_display('\x1B'));
    }

    #[test]
    fn is_unsafe_for_display_detects_bel() {
        assert!(is_unsafe_for_display('\x07'));
    }

    #[test]
    fn is_unsafe_for_display_detects_del() {
        assert!(is_unsafe_for_display('\x7F'));
    }

    #[test]
    fn is_unsafe_for_display_detects_rlo() {
        assert!(is_unsafe_for_display('\u{202E}'));
    }

    #[test]
    fn is_unsafe_for_display_detects_bom() {
        assert!(is_unsafe_for_display('\u{FEFF}'));
    }

    #[test]
    fn is_unsafe_for_display_detects_zwsp() {
        assert!(is_unsafe_for_display('\u{200B}'));
    }

    #[test]
    fn is_unsafe_for_display_detects_unicode_line_separators() {
        assert!(is_unsafe_for_display('\u{0085}'));
        assert!(is_unsafe_for_display('\u{2028}'));
        assert!(is_unsafe_for_display('\u{2029}'));
    }

    #[test]
    fn is_unsafe_for_display_allows_normal_chars() {
        assert!(!is_unsafe_for_display('a'));
        assert!(!is_unsafe_for_display(' '));
        assert!(!is_unsafe_for_display('é'));
    }

    // ---- sanitize_for_display -----------------------------------------

    #[test]
    fn sanitize_for_display_strips_control_chars() {
        assert_eq!(sanitize_for_display("he\x1Bllo"), "hello");
    }

    #[test]
    fn sanitize_for_display_strips_rlo() {
        assert_eq!(sanitize_for_display("he\u{202E}llo"), "hello");
    }

    #[test]
    fn sanitize_for_display_strips_line_breaks_and_tabs() {
        // '\n', '\r' and '\t' are ASCII control characters, so the
        // single-line sanitiser removes them as well.
        assert_eq!(sanitize_for_display("a\nb\tc\rd"), "abcd");
    }

    #[test]
    fn sanitize_for_display_preserves_normal_text() {
        assert_eq!(sanitize_for_display("hello world"), "hello world");
    }

    #[test]
    fn sanitize_for_display_handles_empty_input() {
        assert_eq!(sanitize_for_display(""), "");
    }

    // ---- sanitize_multiline_for_display -------------------------------

    #[test]
    fn sanitize_multiline_for_display_preserves_line_breaks_and_tabs() {
        assert_eq!(
            sanitize_multiline_for_display("a\nb\tc\r\nd"),
            "a\nb\tc\r\nd"
        );
    }

    #[test]
    fn sanitize_multiline_for_display_strips_control_chars() {
        assert_eq!(sanitize_multiline_for_display("he\x1Bllo\n"), "hello\n");
    }

    #[test]
    fn sanitize_multiline_for_display_strips_rlo() {
        assert_eq!(sanitize_multiline_for_display("he\u{202E}llo"), "hello");
    }

    #[test]
    fn sanitize_multiline_for_display_strips_unicode_line_separators() {
        assert_eq!(sanitize_multiline_for_display("a\u{2028}b\u{2029}c"), "abc");
    }

    #[test]
    fn sanitize_multiline_for_display_handles_empty_input() {
        assert_eq!(sanitize_multiline_for_display(""), "");
    }

    // ---- is_nu_drop_char ----------------------------------------------

    #[test]
    fn is_nu_drop_char_drops_all_ascii_control() {
        for b in 0u8..=0x1F {
            let c = b as char;
            if matches!(c, '\n' | '\r' | '\t') {
                assert!(
                    !is_nu_drop_char(c),
                    "U+{:04X} must not be dropped (it is escaped)",
                    b
                );
            } else {
                assert!(is_nu_drop_char(c), "U+{:04X} must be dropped", b);
            }
        }
        assert!(is_nu_drop_char('\x7F'), "DEL must be dropped");
    }

    #[test]
    fn is_nu_drop_char_drops_nel() {
        assert!(is_nu_drop_char('\u{0085}'));
    }

    #[test]
    fn is_nu_drop_char_drops_line_separator() {
        assert!(is_nu_drop_char('\u{2028}'));
    }

    #[test]
    fn is_nu_drop_char_drops_paragraph_separator() {
        assert!(is_nu_drop_char('\u{2029}'));
    }

    #[test]
    fn is_nu_drop_char_drops_rlo() {
        assert!(is_nu_drop_char('\u{202E}'));
    }

    #[test]
    fn is_nu_drop_char_drops_bom() {
        assert!(is_nu_drop_char('\u{FEFF}'));
    }

    #[test]
    fn is_nu_drop_char_drops_zwsp() {
        assert!(is_nu_drop_char('\u{200B}'));
    }

    #[test]
    fn is_nu_drop_char_preserves_newline() {
        assert!(!is_nu_drop_char('\n'));
    }

    #[test]
    fn is_nu_drop_char_preserves_carriage_return() {
        assert!(!is_nu_drop_char('\r'));
    }

    #[test]
    fn is_nu_drop_char_preserves_tab() {
        assert!(!is_nu_drop_char('\t'));
    }

    #[test]
    fn is_nu_drop_char_preserves_normal_chars() {
        assert!(!is_nu_drop_char('a'));
        assert!(!is_nu_drop_char(' '));
        assert!(!is_nu_drop_char('"'));
    }

    // ---- double_quote_escape ------------------------------------------
    // These pin the exact escape text, not merely that an escape exists, so a
    // wrong replacement string cannot slip through.

    #[test]
    fn double_quote_escape_escapes_backslash() {
        assert_eq!(double_quote_escape('\\'), Some("\\\\"));
    }

    #[test]
    fn double_quote_escape_escapes_double_quote() {
        assert_eq!(double_quote_escape('"'), Some("\\\""));
    }

    #[test]
    fn double_quote_escape_escapes_newline() {
        assert_eq!(double_quote_escape('\n'), Some("\\n"));
    }

    #[test]
    fn double_quote_escape_escapes_carriage_return() {
        assert_eq!(double_quote_escape('\r'), Some("\\r"));
    }

    #[test]
    fn double_quote_escape_escapes_tab() {
        assert_eq!(double_quote_escape('\t'), Some("\\t"));
    }

    #[test]
    fn double_quote_escape_ignores_letter() {
        assert_eq!(double_quote_escape('a'), None);
    }

    #[test]
    fn double_quote_escape_ignores_dollar() {
        assert_eq!(double_quote_escape('$'), None);
    }

    #[test]
    fn double_quote_escape_ignores_nul() {
        assert_eq!(double_quote_escape('\0'), None);
    }
}