/// Returns an owned copy of `text`, which by construction contains no UTF-16
/// surrogate code points.
///
/// A Rust `&str` is always valid UTF-8 and iterating it yields `char`s, which are
/// Unicode scalar values. Scalar values exclude the surrogate range
/// `U+D800..=U+DFFF`, so neither a lone surrogate nor an explicit surrogate pair
/// can ever be observed inside `text`. Characters outside the Basic Multilingual
/// Plane (e.g. emoji) are single `char`s here, not pairs, and are preserved as-is.
///
/// Consequently there is nothing to strip: the previous per-character scan for
/// surrogates could never match and always reproduced the input verbatim. Any
/// surrogate present in raw bytes has to be handled when those bytes are decoded
/// into a `str` (for example, `String::from_utf8_lossy` substitutes U+FFFD).
///
/// # Parameters
/// * `text` - the string to copy.
///
/// # Returns
/// A new `String` with exactly the same contents as `text`.
pub fn sanitize_surrogates(text: &str) -> String {
    // `char` cannot represent U+D800..=U+DFFF, so filtering would be dead code.
    text.to_owned()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes `prefix + raw + suffix` with `String::from_utf8_lossy`.
    ///
    /// `raw` is used to inject UTF-8-encoded surrogate bytes. Those bytes are not
    /// valid UTF-8, so the lossy decoder replaces them with U+FFFD before the
    /// function under test ever sees the string.
    fn lossy_input(prefix: &[u8], raw: &[u8], suffix: &[u8]) -> String {
        let mut bytes: Vec<u8> = Vec::with_capacity(prefix.len() + raw.len() + suffix.len());
        bytes.extend_from_slice(prefix);
        bytes.extend_from_slice(raw);
        bytes.extend_from_slice(suffix);
        String::from_utf8_lossy(&bytes).into_owned()
    }

    /// Byte pattern of U+D800 (a high surrogate) if it were encoded as UTF-8.
    const HIGH_SURROGATE_BYTES: [u8; 3] = [0xED, 0xA0, 0x80];
    /// Byte pattern of U+DC00 (a low surrogate) if it were encoded as UTF-8.
    const LOW_SURROGATE_BYTES: [u8; 3] = [0xED, 0xB0, 0x80];

    #[test]
    fn test_valid_emoji_preserved() {
        // U+1F389 lies outside the BMP, i.e. it is a surrogate pair in UTF-16.
        assert_eq!(
            sanitize_surrogates("Hello \u{1F389} World"),
            "Hello \u{1F389} World"
        );
    }

    #[test]
    fn test_normal_text_unchanged() {
        assert_eq!(sanitize_surrogates("Hello, world!"), "Hello, world!");
    }

    #[test]
    fn test_empty_string() {
        assert_eq!(sanitize_surrogates(""), "");
    }

    #[test]
    fn test_ascii_preserved() {
        assert_eq!(sanitize_surrogates("abc123!@#"), "abc123!@#");
    }

    #[test]
    fn test_multiple_emoji_preserved() {
        // Mix of supplementary-plane emoji and one BMP symbol (U+2728).
        let emoji = "\u{1F389}\u{1F38A}\u{2728}\u{1F525}";
        assert_eq!(sanitize_surrogates(emoji), emoji);
    }

    #[test]
    fn test_cjk_characters_preserved() {
        // U+4F60 U+597D U+4E16 U+754C ("hello world" in Chinese).
        let cjk = "\u{4F60}\u{597D}\u{4E16}\u{754C}";
        assert_eq!(sanitize_surrogates(cjk), cjk);
    }

    #[test]
    fn test_unpaired_high_surrogate_removed() {
        let input = lossy_input(b"Text ", &HIGH_SURROGATE_BYTES, b" here");
        // The lossy decoder has already swapped the surrogate bytes for U+FFFD.
        assert!(input.contains('\u{FFFD}'));
        let result = sanitize_surrogates(&input);
        // Exact equality: the surrounding text must not be altered in any way.
        assert_eq!(result, input);
        assert!(result.starts_with("Text "));
        assert!(result.ends_with(" here"));
    }

    #[test]
    fn test_unpaired_low_surrogate_removed() {
        let input = lossy_input(b"Text ", &LOW_SURROGATE_BYTES, b" here");
        // The lossy decoder has already swapped the surrogate bytes for U+FFFD.
        assert!(input.contains('\u{FFFD}'));
        let result = sanitize_surrogates(&input);
        assert_eq!(result, input);
        assert!(result.starts_with("Text "));
        assert!(result.ends_with(" here"));
    }

    #[test]
    fn test_trailing_unpaired_high_surrogate() {
        let input = lossy_input(b"Hello", &HIGH_SURROGATE_BYTES, b"");
        assert!(input.contains('\u{FFFD}'));
        let result = sanitize_surrogates(&input);
        assert_eq!(result, input);
        assert!(result.starts_with("Hello"));
    }

    #[test]
    fn test_leading_unpaired_low_surrogate() {
        let input = lossy_input(b"", &LOW_SURROGATE_BYTES, b"Hello");
        assert!(input.contains('\u{FFFD}'));
        let result = sanitize_surrogates(&input);
        assert_eq!(result, input);
        assert!(result.ends_with("Hello"));
    }
}