include!(concat!(env!("OUT_DIR"), "/sanitize_table.rs"));
/// Returns `true` when code point `cp` is marked as allowed in the generated
/// two-stage lookup table.
///
/// The upper bits (`cp >> 8`) select an entry in `SANITIZE_STAGE1`, which is
/// the index of a leaf in `SANITIZE_STAGE2`. Each leaf is a bitmap covering the
/// 256 code points of that page: the low byte of `cp` picks the element
/// (`/ 8`) and the bit within it (`% 8`). A page index past the end of
/// `SANITIZE_STAGE1` means the code point is not allowed.
#[inline]
fn is_allowed(cp: u32) -> bool {
    let page = (cp >> 8) as usize;
    let offset = (cp & 0xFF) as usize;

    if let Some(&leaf) = SANITIZE_STAGE1.get(page) {
        let cell = SANITIZE_STAGE2[leaf as usize][offset / 8];
        let bit = offset % 8;
        (cell >> bit) & 1 != 0
    } else {
        // Page lies beyond the table: treat as disallowed.
        false
    }
}
/// Emoji code points that `sanitize` always rejects, regardless of what the
/// allow table says.
///
/// Currently only U+1F3F4 WAVING BLACK FLAG. NOTE(review): presumably banned
/// because it is the base character of emoji tag sequences — confirm.
const FORBIDDEN_EMOJI: &[char] = &['\u{1F3F4}'];
/// Invisible formatting and bidirectional-control characters that `sanitize`
/// rejects even if the allow table would accept them. This list is active
/// when the `bidi` feature is disabled.
#[cfg(not(feature = "bidi"))]
const FORBIDDEN_BIDI: &[char] = &[
    // U+200B..=U+200F: zero-width space, non-joiner, joiner, LRM, RLM.
    '\u{200B}',
    '\u{200C}',
    '\u{200D}',
    '\u{200E}',
    '\u{200F}',
    // U+202A..=U+202E: bidi embeddings, overrides and pop (LRE, RLE, PDF, LRO, RLO).
    '\u{202A}',
    '\u{202B}',
    '\u{202C}',
    '\u{202D}',
    '\u{202E}',
    // U+2060..=U+2064: word joiner and the invisible math operators.
    '\u{2060}',
    '\u{2061}',
    '\u{2062}',
    '\u{2063}',
    '\u{2064}',
    // U+2066..=U+2069: bidi isolates (LRI, RLI, FSI, PDI).
    '\u{2066}',
    '\u{2067}',
    '\u{2068}',
    '\u{2069}',
    // U+206A..=U+206F: deprecated format characters.
    '\u{206A}',
    '\u{206B}',
    '\u{206C}',
    '\u{206D}',
    '\u{206E}',
    '\u{206F}',
];
/// With the `bidi` feature enabled this deny-list is empty, so these
/// characters are judged only by the other checks in `sanitize`.
#[cfg(feature = "bidi")]
const FORBIDDEN_BIDI: &[char] = &[];
/// Removes disallowed characters from `s`.
///
/// A character is rejected when it appears in `FORBIDDEN_EMOJI` or
/// `FORBIDDEN_BIDI`, or when `is_allowed` returns `false` for its code point.
///
/// Returns `None` when every character is acceptable (nothing to change).
/// Otherwise returns `Some` with the text before the first rejected character
/// joined to the text after the last rejected character. With the `verbose`
/// feature a `[N BYTES SANITIZED]` marker is inserted at the cut, where `N` is
/// the byte length of the removed span.
///
/// Note that the whole span from the first to the last rejected character is
/// removed, including any acceptable characters that sit between them.
pub fn sanitize(s: &str) -> Option<String> {
    let is_rejected = |c: char| {
        FORBIDDEN_EMOJI.contains(&c) || FORBIDDEN_BIDI.contains(&c) || !is_allowed(c as u32)
    };
    let mut rejected = s.char_indices().filter(|&(_, c)| is_rejected(c));

    // No rejected character at all: signal "unchanged" to the caller.
    let (start, head) = rejected.next()?;
    // With a single rejected character, the span starts and ends on it.
    let (tail_at, tail) = rejected.next_back().unwrap_or((start, head));
    // Exclusive end of the span: just past the last rejected character.
    let stop = tail_at + tail.len_utf8();

    let prefix = &s[..start];
    let suffix = &s[stop..];
    Some(if cfg!(feature = "verbose") {
        format!("{}[{} BYTES SANITIZED]{}", prefix, stop - start, suffix)
    } else {
        format!("{}{}", prefix, suffix)
    })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Exhaustively checks that the generated lookup table agrees with
    /// `crate::ranges::ENABLED_RANGES` for every value in `0..=0x10FFFF`.
    #[test]
    fn table_matches_ranges() {
        for cp in 0u32..=0x10_FFFF {
            let by_table = is_allowed(cp);
            let by_ranges = crate::ranges::ENABLED_RANGES
                .iter()
                .any(|r| r.contains(&cp));
            assert_eq!(
                by_table, by_ranges,
                "allow-set mismatch at U+{cp:04X}: table={by_table} ranges={by_ranges}"
            );
        }
    }

    /// Basic accept/reject cases; expectations vary with the enabled features.
    #[test]
    fn test_sanitize() {
        // Plain ASCII, including tab and newline, passes through untouched.
        assert_eq!(sanitize("hello \t\n"), None);

        // "Ā" is U+0100, which encodes to 2 UTF-8 bytes.
        #[cfg(feature = "latin-1-supplement")]
        assert_eq!(sanitize("Ā"), None);
        #[cfg(all(not(feature = "latin-1-supplement"), feature = "verbose"))]
        assert_eq!(sanitize("Ā"), Some("[2 BYTES SANITIZED]".to_string()));
        #[cfg(all(not(feature = "latin-1-supplement"), not(feature = "verbose")))]
        assert_eq!(sanitize("Ā"), Some("".to_string()));

        // The black flag is always rejected via `FORBIDDEN_EMOJI`.
        #[cfg(not(feature = "verbose"))]
        assert_eq!(sanitize("🏴").unwrap(), "");

        #[cfg(feature = "emoji")]
        assert_eq!(sanitize("👍"), None);
        #[cfg(feature = "emoji")]
        assert_eq!(sanitize("🙏"), None);
    }

    /// A URL followed by an invisible payload of Unicode TAG characters must
    /// have the payload stripped when the `tags` feature is disabled.
    ///
    /// The payload is built from explicit escapes instead of being embedded
    /// as invisible characters in a literal, so it cannot be silently lost
    /// by editors or tooling and stays reviewable in the source.
    #[test]
    #[cfg(not(feature = "tags"))]
    fn test_sanitize_hidden_tag_payload() {
        let url = "https://wuzzi.net/copirate/";
        // 39 TAG characters, 4 UTF-8 bytes each = 156 bytes.
        let hidden = "\u{E0041}".repeat(39);
        assert_eq!(hidden.len(), 156);
        let input = format!("{url}{hidden}");

        let expected = if cfg!(feature = "verbose") {
            format!("{url}[156 BYTES SANITIZED]")
        } else {
            url.to_string()
        };
        assert_eq!(sanitize(&input), Some(expected));
    }

    /// With `general-punctuation` enabled but `bidi` disabled, ordinary
    /// punctuation is accepted while the entries of `FORBIDDEN_BIDI` are
    /// still rejected.
    #[test]
    #[cfg(all(not(feature = "bidi"), feature = "general-punctuation"))]
    fn test_bidi_denied_with_general_punctuation() {
        assert!(
            sanitize("hello \u{2014} world").is_none(),
            "em dash should be allowed"
        );
        assert!(
            sanitize("hello\u{200B}world").is_some(),
            "zero-width space should be denied"
        );
        assert!(
            sanitize("hello\u{202E}world").is_some(),
            "RTL override should be denied"
        );
        assert!(
            sanitize("hello\u{200E}world").is_some(),
            "LTR mark should be denied"
        );
    }
}