#[cfg(feature = "alloc")]
use crate::DecodeUtf8Error;
use crate::{
InvalidEscapeError, InvalidHexError, LoneSurrogateError, UnescapeError, UnescapeErrorKind,
};
use core::{
fmt::{self, Write as _},
iter::FusedIterator,
};
use memchr::memchr;
#[cfg(feature = "alloc")]
use alloc::{borrow::Cow, string::String, vec::Vec};
#[inline]
pub fn escape_str(s: &str) -> EscapeTokens<'_> {
EscapeTokens {
bytes: s.as_bytes(),
}
}
/// One piece of escaped output produced by [`EscapeTokens`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EscapedToken<'a> {
/// A run of the input that is emitted unchanged, borrowed from the input.
Literal(&'a str),
/// The static replacement text for a single input byte that must be
/// escaped (for example `\n` or `\u0001`).
Escaped(&'static str),
}
impl<'a> EscapedToken<'a> {
#[inline(always)]
pub(crate) fn as_str(&self) -> &'a str {
match self {
EscapedToken::Literal(s) => s,
EscapedToken::Escaped(s) => s,
}
}
}
/// Writes the token's output text verbatim.
impl fmt::Display for EscapedToken<'_> {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text = self.as_str();
        f.write_str(text)
    }
}
/// Lazy iterator that yields the escaped form of a string as a sequence of
/// [`EscapedToken`]s.
#[derive(Clone, Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct EscapeTokens<'a> {
// Input bytes that have not been turned into tokens yet. The public
// constructors fill this from a `&str`.
pub(crate) bytes: &'a [u8],
}
impl<'a> EscapeTokens<'a> {
/// Creates an escaping iterator over `s`. Usable in `const` contexts.
#[inline]
pub const fn new(s: &'a str) -> Self {
    let bytes = s.as_bytes();
    Self { bytes }
}
/// Looks up the replacement sequence for `byte`, or `None` when the byte can
/// be emitted as-is.
#[inline(always)]
pub(crate) fn escape(byte: u8) -> Option<&'static str> {
    ESCAPE_TABLE[usize::from(byte)]
}
/// Splits `bytes` at the first byte that needs escaping.
///
/// Returns the leading run that needs no escaping (as `&str`) and the
/// remainder, which is either empty or starts with a byte that must be
/// escaped.
///
/// # Safety
///
/// `bytes` must be valid UTF-8. The split point is always an ASCII byte
/// (control character, `"` or `\`) or the end of the slice, so the prefix of
/// valid UTF-8 is itself valid UTF-8; that is what makes the unchecked
/// conversion below sound.
#[inline(always)]
pub(crate) unsafe fn split_at_escape(bytes: &[u8]) -> (&str, &[u8]) {
    let pos = Self::find_escape_char(bytes).unwrap_or(bytes.len());
    let (literal_bytes, rest) = bytes.split_at(pos);
    (
        // `core::str` rather than `std::str`: the rest of this file is written
        // against `core`/`alloc`, and this helper has no need for `std`.
        //
        // SAFETY: the caller guarantees `bytes` is valid UTF-8 and `pos` is
        // either `bytes.len()` or the index of an ASCII byte, i.e. a char
        // boundary.
        unsafe { core::str::from_utf8_unchecked(literal_bytes) },
        rest,
    )
}
// Portable-SIMD variant (nightly only): returns the index of the first byte
// that needs escaping -- a byte below 0x20, `"` or `\` -- or `None` if there
// is no such byte.
#[doc(hidden)]
#[cfg(all(feature = "simd", nightly))]
#[inline]
pub fn find_escape_char(bytes: &[u8]) -> Option<usize> {
use std::simd::{Simd, prelude::SimdPartialEq, prelude::SimdPartialOrd};
// Process the input in 16-byte vectors; `i` is the offset of the current one.
const LANES: usize = 16; let mut i = 0;
while i + LANES <= bytes.len() {
// The loop condition guarantees at least LANES bytes remain at `i`.
let chunk = Simd::<u8, LANES>::from_slice(&bytes[i..]);
// `<= b' ' - 1` is the unsigned test for "below space", i.e. a control byte.
let space_v = Simd::splat(b' ' - 1); let quote_v = Simd::splat(b'"');
let slash_v = Simd::splat(b'\\');
let lt_space_mask = chunk.simd_le(space_v);
let eq_quote_mask = chunk.simd_eq(quote_v);
let eq_slash_mask = chunk.simd_eq(slash_v);
let combined_mask = lt_space_mask | eq_quote_mask | eq_slash_mask;
if combined_mask.any() {
// The lowest set bit of the bitmask is the first matching lane.
let first_match_index = combined_mask.to_bitmask().trailing_zeros() as usize;
return Some(i + first_match_index);
}
i += LANES;
}
// Fewer than LANES bytes left: finish with a per-byte table lookup.
if i < bytes.len() {
if let Some(pos) = bytes[i..]
.iter()
.position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
{
return Some(i + pos);
}
}
None
}
/// SSE2 variant: returns the index of the first byte that needs escaping
/// (a byte below 0x20, `"` or `\`), or `None` if there is none.
///
/// Full 16-byte chunks are scanned with SSE2 intrinsics when the build
/// enables `sse2`; whatever is left is scanned byte by byte through
/// `ESCAPE_DECISION_TABLE`.
#[doc(hidden)]
#[cfg(all(feature = "simd", not(nightly), target_arch = "x86_64"))]
#[inline]
pub fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    use std::arch::x86_64::*;

    const LANES: usize = 16;

    /// Checks the 16 bytes starting at `i`; the caller must ensure that
    /// `i + LANES <= bytes.len()` and that SSE2 is available.
    #[target_feature(enable = "sse2")]
    unsafe fn find_in_chunk(bytes: &[u8], i: usize) -> Option<usize> {
        debug_assert!(
            i + LANES <= bytes.len(),
            "find_in_chunk: attempted to load past end of slice"
        );
        // SAFETY: the caller guarantees 16 readable bytes at offset `i`, and
        // `_mm_loadu_si128` has no alignment requirement.
        let chunk = unsafe { _mm_loadu_si128(bytes.as_ptr().add(i) as *const _) };

        // SSE2 only has a signed byte compare; flipping the top bit of both
        // operands turns it into the unsigned `byte < b' '` test.
        let sign_flip = _mm_set1_epi8(0x80u8 as i8);
        let is_control = _mm_cmplt_epi8(
            _mm_xor_si128(chunk, sign_flip),
            _mm_xor_si128(_mm_set1_epi8(b' ' as i8), sign_flip),
        );
        let is_quote = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'"' as i8));
        let is_backslash = _mm_cmpeq_epi8(chunk, _mm_set1_epi8(b'\\' as i8));

        let hits = _mm_movemask_epi8(_mm_or_si128(
            is_control,
            _mm_or_si128(is_quote, is_backslash),
        ));
        match hits {
            0 => None,
            // Bit k of the mask corresponds to byte k of the chunk.
            bits => Some(i + bits.trailing_zeros() as usize),
        }
    }

    let mut i = 0;
    if cfg!(target_feature = "sse2") {
        while i + LANES <= bytes.len() {
            // SAFETY: the loop condition keeps the 16-byte load in bounds and
            // this branch is only taken when `sse2` is enabled for the build.
            if let Some(result) = unsafe { find_in_chunk(bytes, i) } {
                return Some(result);
            }
            i += LANES;
        }
    }

    // Scalar scan of whatever the vector loop did not cover.
    bytes[i..]
        .iter()
        .position(|&b| ESCAPE_DECISION_TABLE[b as usize] != 0)
        .map(|pos| i + pos)
}
/// Portable word-at-a-time (SWAR) variant: returns the index of the first
/// byte that needs escaping (a byte below 0x20, `"` or `\`), or `None` if
/// there is none.
///
/// Whole machine words are tested with the classic "has zero byte" /
/// "has byte less than n" bit tricks; the remaining tail is checked byte by
/// byte with the same predicate.
#[doc(hidden)]
#[cfg(not(feature = "simd"))]
#[inline]
pub fn find_escape_char(bytes: &[u8]) -> Option<usize> {
    const WORD_SIZE: usize = core::mem::size_of::<usize>();
    const THRESH: u8 = 0x20;

    /// Broadcasts `b` into every byte of a `usize`.
    const fn repeat(b: u8) -> usize {
        let mut m: usize = 0;
        let mut i = 0;
        while i < WORD_SIZE {
            m = (m << 8) | (b as usize);
            i += 1;
        }
        m
    }

    const ONE_MASK: usize = repeat(0x01);
    const MSB_MASK: usize = repeat(0x80);
    const QUOTE_MASK: usize = repeat(b'"');
    const SLASH_MASK: usize = repeat(b'\\');
    const THR_MASK: usize = repeat(THRESH);

    let mut i = 0usize;
    while i + WORD_SIZE <= bytes.len() {
        // SAFETY: the loop condition guarantees `WORD_SIZE` readable bytes at
        // offset `i`, and `read_unaligned` has no alignment requirement.
        let raw = unsafe { (bytes.as_ptr().add(i) as *const usize).read_unaligned() };
        // Normalise so that the first byte in memory is the least significant
        // byte on every target. The subtractions below borrow towards more
        // significant bytes, so only the *lowest* flagged byte is guaranteed
        // to be a real match; bytes above it may be flagged spuriously.
        // Taking the most significant flagged byte (as a big-endian
        // `leading_zeros` would) can therefore report a false position.
        let word = usize::from_le(raw);

        // A byte equals `"` / `\` iff the XOR with the broadcast is zero.
        let xq = word ^ QUOTE_MASK;
        let quote_bits = (xq.wrapping_sub(ONE_MASK) & !xq) & MSB_MASK;
        let xs = word ^ SLASH_MASK;
        let slash_bits = (xs.wrapping_sub(ONE_MASK) & !xs) & MSB_MASK;
        // A byte is below THRESH iff subtracting THRESH sets its top bit while
        // the original top bit was clear.
        let control_bits = (word.wrapping_sub(THR_MASK) & !word) & MSB_MASK;

        let combined = quote_bits | slash_bits | control_bits;
        if combined != 0 {
            return Some(i + (combined.trailing_zeros() as usize) / 8);
        }
        i += WORD_SIZE;
    }

    // Tail shorter than one word: same predicate as the word loop, applied
    // per byte (this is exactly the set of bytes that have an escape entry).
    bytes[i..]
        .iter()
        .position(|&b| b < THRESH || b == b'"' || b == b'\\')
        .map(|pos| i + pos)
}
#[cfg(all(feature = "simd", not(nightly), not(target_arch = "x86_64")))]
compile_error! { "simd requires nightly or target_arch = \"x86_64\"" }
}
/// Yields one token per call: either the replacement for a single byte that
/// must be escaped, or the longest run of bytes that needs no escaping.
impl<'a> Iterator for EscapeTokens<'a> {
    type Item = EscapedToken<'a>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let &first = self.bytes.first()?;
        let token = match Self::escape(first) {
            Some(replacement) => {
                // Exactly one input byte is consumed per escape sequence.
                self.bytes = &self.bytes[1..];
                EscapedToken::Escaped(replacement)
            }
            None => {
                // SAFETY: relies on `self.bytes` being valid UTF-8. The public
                // constructors build it from a `&str`, and every split made
                // here happens at an ASCII byte.
                // NOTE(review): the field is `pub(crate)`; confirm no other
                // crate code stores non-UTF-8 bytes in it.
                let (literal, rest) = unsafe { Self::split_at_escape(self.bytes) };
                self.bytes = rest;
                EscapedToken::Literal(literal)
            }
        };
        Some(token)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        match self.bytes.len() {
            0 => (0, Some(0)),
            // Non-empty input yields at least one token and at most one token
            // per remaining byte.
            remaining => (1, Some(remaining)),
        }
    }
}
// `next` returns `None` whenever the remaining input is empty, and the input
// only ever shrinks, so the iterator keeps returning `None` once exhausted.
impl<'a> FusedIterator for EscapeTokens<'a> {}
/// Formats the complete escaped output by writing every token in order.
///
/// Iterates over a clone, so displaying does not consume the iterator.
impl fmt::Display for EscapeTokens<'_> {
    #[inline]
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        self.clone()
            .try_for_each(|token| f.write_str(token.as_str()))
    }
}
/// Collects the escaped output into a `Cow`, borrowing from the input when
/// nothing had to be escaped and allocating otherwise.
#[cfg(feature = "alloc")]
impl<'a> From<EscapeTokens<'a>> for Cow<'a, str> {
    fn from(mut iter: EscapeTokens<'a>) -> Self {
        let Some(first) = iter.next() else {
            return Cow::Borrowed("");
        };

        // A single literal token covering the whole input means no byte
        // needed escaping, so the input can be returned as-is.
        if let EscapedToken::Literal(whole) = first {
            if iter.bytes.is_empty() {
                return Cow::Borrowed(whole);
            }
        }

        let head = first.as_str();
        // The output is at least as long as the input, so this is a lower
        // bound on the final length.
        let mut escaped = String::with_capacity(head.len() + iter.bytes.len());
        escaped.push_str(head);
        escaped.extend(iter);
        Cow::Owned(escaped)
    }
}
#[inline]
pub fn unescape<I: AsRef<[u8]> + ?Sized>(input: &I) -> UnescapeTokens<'_> {
UnescapeTokens {
bytes: input.as_ref(),
}
}
/// One piece of unescaped output produced by [`UnescapeTokens`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum UnescapedToken<'a> {
/// A run of input bytes containing no backslash, borrowed from the input.
/// It is raw bytes and has not been checked to be valid UTF-8.
Literal(&'a [u8]),
/// The character decoded from a single escape sequence.
Unescaped(char),
}
impl UnescapedToken<'_> {
    /// Returns a `Display` adapter for this token in strict (non-lossy) mode.
    ///
    /// Literal bytes are handed to `crate::display_bytes_utf8` with
    /// `lossy = false`; how invalid UTF-8 is reported is defined by that
    /// helper. Use [`display_utf8_lossy`](Self::display_utf8_lossy) for the
    /// lossy mode.
    pub fn display_utf8(&self) -> DisplayUnescapedToken<'_> {
        DisplayUnescapedToken {
            token: self,
            // Strict mode: this previously passed `true`, which made the
            // method indistinguishable from `display_utf8_lossy` and
            // inconsistent with `UnescapeTokens::display_utf8`.
            lossy: false,
        }
    }

    /// Returns a `Display` adapter for this token in lossy mode
    /// (`lossy = true` is passed to `crate::display_bytes_utf8`).
    pub fn display_utf8_lossy(&self) -> DisplayUnescapedToken<'_> {
        DisplayUnescapedToken {
            token: self,
            lossy: true,
        }
    }

    /// Number of bytes this token contributes to the unescaped output: the
    /// literal's length, or the UTF-8 length of the decoded character.
    #[inline(always)]
    const fn len(&self) -> usize {
        match self {
            UnescapedToken::Literal(literal) => literal.len(),
            UnescapedToken::Unescaped(ch) => ch.len_utf8(),
        }
    }
}
/// `Display` adapter for a single [`UnescapedToken`].
pub struct DisplayUnescapedToken<'a> {
// The token to render.
token: &'a UnescapedToken<'a>,
// Forwarded to `crate::display_bytes_utf8` when rendering literal bytes.
lossy: bool,
}
/// Writes a decoded character directly; literal bytes are rendered by
/// `crate::display_bytes_utf8` using the adapter's `lossy` flag.
impl fmt::Display for DisplayUnescapedToken<'_> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let token = self.token;
        match token {
            UnescapedToken::Unescaped(ch) => f.write_char(*ch),
            UnescapedToken::Literal(raw) => crate::display_bytes_utf8(raw, f, self.lossy),
        }
    }
}
/// Lazy iterator that unescapes a byte slice into a sequence of
/// [`UnescapedToken`]s, reporting malformed escapes as errors.
#[derive(Clone, Debug)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct UnescapeTokens<'a> {
// Input that has not been consumed yet. It is not advanced when an escape
// fails to parse, so after an error it still starts at the bad backslash.
bytes: &'a [u8],
}
impl<'a> UnescapeTokens<'a> {
#[inline]
pub const fn new(bytes: &'a [u8]) -> Self {
Self { bytes }
}
/// Returns the part of the input that has not been consumed yet.
///
/// After `next` returns an error the iterator has not advanced, so the
/// remnant begins at the backslash of the offending escape sequence.
#[inline]
pub const fn remnant(&self) -> &'a [u8] {
    let remaining: &'a [u8] = self.bytes;
    remaining
}
/// Unescapes the whole input and validates the result as UTF-8.
///
/// Borrows from the input when nothing had to be unescaped.
///
/// # Errors
///
/// * `DecodeUtf8Error::Unescape` if an escape sequence is malformed.
/// * `DecodeUtf8Error::Utf8` if the unescaped bytes are not valid UTF-8.
#[cfg(feature = "alloc")]
pub fn decode_utf8(self) -> Result<Cow<'a, str>, DecodeUtf8Error> {
    let unescaped: Cow<'a, [u8]> = self.try_into().map_err(DecodeUtf8Error::Unescape)?;
    match unescaped {
        Cow::Borrowed(bytes) => match str::from_utf8(bytes) {
            Ok(text) => Ok(Cow::Borrowed(text)),
            Err(err) => Err(DecodeUtf8Error::Utf8(err)),
        },
        Cow::Owned(bytes) => match String::from_utf8(bytes) {
            Ok(text) => Ok(Cow::Owned(text)),
            Err(err) => Err(DecodeUtf8Error::Utf8(err.utf8_error())),
        },
    }
}
/// Unescapes the whole input and converts the result to a string with
/// `crate::decode_utf8_lossy`.
///
/// # Errors
///
/// Returns the `UnescapeError` of the first malformed escape sequence.
#[cfg(feature = "alloc")]
pub fn decode_utf8_lossy(self) -> Result<Cow<'a, str>, UnescapeError> {
    use crate::decode_utf8_lossy;

    let unescaped = self.try_into()?;
    Ok(decode_utf8_lossy(unescaped))
}
/// Wraps the iterator in a `Display` adapter that renders the unescaped
/// output in strict (non-lossy) mode.
pub fn display_utf8(self) -> DisplayUnescapeTokens<'a> {
    let lossy = false;
    DisplayUnescapeTokens { inner: self, lossy }
}
/// Wraps the iterator in a `Display` adapter that renders the unescaped
/// output in lossy mode.
pub fn display_utf8_lossy(self) -> DisplayUnescapeTokens<'a> {
    let lossy = true;
    DisplayUnescapeTokens { inner: self, lossy }
}
/// Splits `bytes` just before the first backslash.
///
/// Returns the backslash-free prefix and the remainder, which is either
/// empty or starts with `\`.
#[inline(always)]
pub(crate) fn split_at_escape(bytes: &'a [u8]) -> (&'a [u8], &'a [u8]) {
    let backslash_at = memchr(b'\\', bytes).unwrap_or(bytes.len());
    bytes.split_at(backslash_at)
}
/// Decodes one escape sequence whose leading backslash has already been
/// stripped from `bytes`.
///
/// On success `bytes` is advanced past the sequence and the decoded
/// character is returned. On failure the returned error carries an offset
/// counted from the backslash.
///
/// # Errors
///
/// * `UnexpectedEof` (offset 1) if nothing follows the backslash.
/// * `InvalidEscape` (offset 1) if the byte after the backslash is not a
///   known escape character.
/// * Any error produced while decoding a `\u` sequence.
#[inline(always)]
pub(crate) fn handle_escape(bytes: &mut &'a [u8]) -> Result<char, UnescapeError> {
    let Some(&byte) = bytes.first() else {
        return Err(UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: 1,
        });
    };

    if byte == b'u' {
        *bytes = &bytes[1..];
        return Self::handle_unicode_escape(bytes);
    }

    match UNESCAPE_TABLE[byte as usize] {
        Some(c) => {
            *bytes = &bytes[1..];
            Ok(c)
        }
        None => Err(UnescapeError {
            kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: byte }),
            offset: 1,
        }),
    }
}
/// Decodes the body of a `\u` escape; `bytes` starts at the four hex digits.
///
/// A high surrogate (`D800..=DBFF`) must be followed immediately by a second
/// `\uXXXX` holding a low surrogate (`DC00..=DFFF`); the pair is combined
/// into one character. On success `bytes` is advanced past everything that
/// was consumed. Error offsets are counted from the first backslash.
///
/// # Errors
///
/// * `InvalidHex` / `UnexpectedEof` from parsing either group of hex digits.
/// * `UnexpectedEof` if the input ends right after a high surrogate
///   (offset 6) or after the backslash that follows it (offset 7).
/// * `LoneSurrogate` (offset 6) for an unpaired high or low surrogate.
#[inline(always)]
fn handle_unicode_escape(bytes: &mut &'a [u8]) -> Result<char, UnescapeError> {
    let lone_surrogate = |surrogate: u16| UnescapeError {
        kind: UnescapeErrorKind::LoneSurrogate(LoneSurrogateError { surrogate }),
        offset: 6,
    };
    let unexpected_eof = |offset: u8| UnescapeError {
        kind: UnescapeErrorKind::UnexpectedEof,
        offset,
    };

    // Hex digits sit at offsets 2..6 relative to the backslash.
    let first = Self::parse_hex4(bytes, 2)?;
    *bytes = &bytes[4..];

    let is_high_surrogate = (0xD800..=0xDBFF).contains(&first);
    if !is_high_surrogate {
        // Only an unpaired low surrogate fails the conversion here.
        return char::from_u32(u32::from(first)).ok_or_else(|| lone_surrogate(first));
    }

    // A high surrogate needs a `\u` right behind it.
    match (bytes.first(), bytes.get(1)) {
        (Some(b'\\'), Some(b'u')) => {}
        (Some(b'\\'), None) => return Err(unexpected_eof(7)),
        (None, None) => return Err(unexpected_eof(6)),
        _ => return Err(lone_surrogate(first)),
    }

    // The second group of hex digits sits at offsets 8..12.
    let low = Self::parse_hex4(&bytes[2..], 8)?;
    if !(0xDC00..=0xDFFF).contains(&low) {
        return Err(lone_surrogate(first));
    }

    let code = 0x10000 + (((u32::from(first) - 0xD800) << 10) | (u32::from(low) - 0xDC00));
    let result_char = char::from_u32(code)
        .expect("valid surrogate pair math should always produce a valid char");
    // Skip the `\uXXXX` of the low surrogate.
    *bytes = &bytes[6..];
    Ok(result_char)
}
/// Parses the first four bytes of `slice` as a big-endian hexadecimal `u16`.
///
/// `base_offset` is the position of `slice[0]` relative to the start of the
/// escape sequence; it is only used to compute error offsets.
///
/// # Errors
///
/// * `InvalidHex` at the offset of the first non-hex byte.
/// * `UnexpectedEof` (offset just past the last byte) if fewer than four
///   bytes are available and all of them are hex digits.
#[inline(always)]
fn parse_hex4(slice: &[u8], base_offset: u8) -> Result<u16, UnescapeError> {
    /// Builds the error for the failure path; kept out of line because the
    /// happy path is the hot one.
    #[cold]
    fn handle_error(slice: &[u8], base_offset: u8) -> UnescapeError {
        for (i, &b) in slice.iter().enumerate() {
            if HEX[b as usize].is_none() {
                return UnescapeError {
                    kind: UnescapeErrorKind::InvalidHex(InvalidHexError { found: b }),
                    offset: base_offset + i as u8,
                };
            }
        }
        UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: base_offset + slice.len() as u8,
        }
    }

    let nibbles = match slice.get(..4) {
        Some(&[d0, d1, d2, d3]) => (
            HEX[d0 as usize],
            HEX[d1 as usize],
            HEX[d2 as usize],
            HEX[d3 as usize],
        ),
        _ => return Err(handle_error(slice, base_offset)),
    };

    match nibbles {
        (Some(n0), Some(n1), Some(n2), Some(n3)) => {
            Ok(u16::from(n0) << 12 | u16::from(n1) << 8 | u16::from(n2) << 4 | u16::from(n3))
        }
        _ => Err(handle_error(slice, base_offset)),
    }
}
}
/// Yields one token per call: the character decoded from an escape sequence,
/// or the longest run of bytes up to the next backslash.
///
/// A malformed escape yields `Some(Err(..))` and does **not** advance the
/// iterator, so the same error is returned on every later call and
/// [`remnant`](UnescapeTokens::remnant) keeps pointing at the bad sequence.
impl<'a> Iterator for UnescapeTokens<'a> {
    type Item = Result<UnescapedToken<'a>, UnescapeError>;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        let (&first, after_backslash) = self.bytes.split_first()?;

        if first != b'\\' {
            let (literal, rest) = Self::split_at_escape(self.bytes);
            self.bytes = rest;
            return Some(Ok(UnescapedToken::Literal(literal)));
        }

        // Parse on a scratch cursor and commit it only on success, so a
        // failed escape leaves `self.bytes` untouched.
        let mut remainder = after_backslash;
        match Self::handle_escape(&mut remainder) {
            Ok(unescaped_char) => {
                self.bytes = remainder;
                Some(Ok(UnescapedToken::Unescaped(unescaped_char)))
            }
            Err(err) => Some(Err(err)),
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        if self.bytes.is_empty() {
            (0, Some(0))
        } else {
            // Lower bound: non-empty input always yields at least one item,
            // but an arbitrarily long input without backslashes is a single
            // literal token, so nothing larger than 1 can be promised. (The
            // previous `(len + 1) / 6` over-reported, e.g. 2 for an 11-byte
            // literal that yields one token.)
            //
            // Upper bound: every successful token consumes at least one byte.
            // NOTE(review): input with a malformed escape repeats its error
            // forever, which exceeds this bound -- confirm whether the upper
            // bound should be `None`.
            (1, Some(self.bytes.len()))
        }
    }
}
// `next` returns `None` only when the remaining input is empty, and nothing
// ever refills it, so once `None` is returned it is returned forever.
impl<'a> FusedIterator for UnescapeTokens<'a> {}
/// Collects the unescaped output into a `Cow`, borrowing from the input when
/// it contains no escape sequences and allocating otherwise.
///
/// Fails with the first `UnescapeError` encountered.
#[cfg(feature = "alloc")]
impl<'a> TryFrom<UnescapeTokens<'a>> for Cow<'a, [u8]> {
    type Error = UnescapeError;

    fn try_from(mut value: UnescapeTokens<'a>) -> Result<Self, Self::Error> {
        /// Appends the bytes of one token to `buf`.
        fn push_token(buf: &mut Vec<u8>, token: UnescapedToken<'_>) {
            match token {
                UnescapedToken::Literal(bytes) => buf.extend_from_slice(bytes),
                UnescapedToken::Unescaped(c) => append_char(buf, c),
            }
        }

        let first_token = match value.next() {
            None => return Ok(Cow::Borrowed(b"")),
            Some(item) => item?,
        };

        // A single literal covering the whole input: nothing to unescape.
        if let UnescapedToken::Literal(whole) = first_token {
            if value.bytes.is_empty() {
                return Ok(Cow::Borrowed(whole));
            }
        }

        // Unescaping never grows the data, so this capacity is sufficient.
        let mut buf = Vec::with_capacity(first_token.len() + value.bytes.len());
        push_token(&mut buf, first_token);
        for item in value {
            push_token(&mut buf, item?);
        }
        Ok(Cow::Owned(buf))
    }
}
/// `Display` adapter that renders the full output of an [`UnescapeTokens`]
/// iterator.
pub struct DisplayUnescapeTokens<'a> {
// The iterator to render; it is cloned for each `fmt` call.
inner: UnescapeTokens<'a>,
// Forwarded to the per-token adapter when rendering literal bytes.
lossy: bool,
}
/// Writes every unescaped token in order.
///
/// Formatting stops with `fmt::Error` at the first malformed escape; tokens
/// before it have already been written at that point.
impl<'a> fmt::Display for DisplayUnescapeTokens<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        for item in self.inner.clone() {
            // `fmt::Error` cannot carry details, so the cause is dropped.
            let token = item.map_err(|_| fmt::Error)?;
            let piece = DisplayUnescapedToken {
                token: &token,
                lossy: self.lossy,
            };
            write!(f, "{}", piece)?;
        }
        Ok(())
    }
}
/// Maps every byte value to its escape sequence, or `None` when the byte can
/// be emitted unchanged.
///
/// Escaped bytes are `"`, `\` and all control bytes `0x00..=0x1F`; the latter
/// use a short form where one exists (`\b`, `\t`, `\n`, `\f`, `\r`) and
/// `\u00XX` otherwise.
const ESCAPE_TABLE: [Option<&'static str>; 256] = {
    const ENTRIES: [(u8, &'static str); 34] = [
        (b'"', r#"\""#),
        (b'\\', r#"\\"#),
        (0x00, r#"\u0000"#),
        (0x01, r#"\u0001"#),
        (0x02, r#"\u0002"#),
        (0x03, r#"\u0003"#),
        (0x04, r#"\u0004"#),
        (0x05, r#"\u0005"#),
        (0x06, r#"\u0006"#),
        (0x07, r#"\u0007"#),
        (0x08, r#"\b"#),
        (0x09, r#"\t"#),
        (0x0A, r#"\n"#),
        (0x0B, r#"\u000b"#),
        (0x0C, r#"\f"#),
        (0x0D, r#"\r"#),
        (0x0E, r#"\u000e"#),
        (0x0F, r#"\u000f"#),
        (0x10, r#"\u0010"#),
        (0x11, r#"\u0011"#),
        (0x12, r#"\u0012"#),
        (0x13, r#"\u0013"#),
        (0x14, r#"\u0014"#),
        (0x15, r#"\u0015"#),
        (0x16, r#"\u0016"#),
        (0x17, r#"\u0017"#),
        (0x18, r#"\u0018"#),
        (0x19, r#"\u0019"#),
        (0x1A, r#"\u001a"#),
        (0x1B, r#"\u001b"#),
        (0x1C, r#"\u001c"#),
        (0x1D, r#"\u001d"#),
        (0x1E, r#"\u001e"#),
        (0x1F, r#"\u001f"#),
    ];

    let mut table: [Option<&'static str>; 256] = [None; 256];
    let mut idx = 0;
    while idx < ENTRIES.len() {
        let (byte, sequence) = ENTRIES[idx];
        table[byte as usize] = Some(sequence);
        idx += 1;
    }
    table
};
/// Per-byte flag table derived from `ESCAPE_TABLE`: `1` when the byte needs
/// escaping, `0` otherwise. Used by the scalar scanning paths and by tests.
#[doc(hidden)]
#[allow(unused)]
pub const ESCAPE_DECISION_TABLE: [u8; 256] = {
    let mut flags = [0u8; 256];
    let mut byte = 0;
    while byte < flags.len() {
        flags[byte] = ESCAPE_TABLE[byte].is_some() as u8;
        byte += 1;
    }
    flags
};
/// Maps the byte following a backslash to the character it stands for, or
/// `None` if the byte does not form a simple escape. `u` is absent on
/// purpose: `\u` sequences are decoded separately.
const UNESCAPE_TABLE: [Option<char>; 256] = {
    const PAIRS: [(u8, char); 8] = [
        (b'"', '\"'),
        (b'\\', '\\'),
        (b'/', '/'),
        (b'b', '\x08'),
        (b'f', '\x0C'),
        (b'n', '\n'),
        (b'r', '\r'),
        (b't', '\t'),
    ];

    let mut tbl: [Option<char>; 256] = [None; 256];
    let mut idx = 0;
    while idx < PAIRS.len() {
        let (escape_byte, decoded) = PAIRS[idx];
        tbl[escape_byte as usize] = Some(decoded);
        idx += 1;
    }
    tbl
};
/// Maps an ASCII byte to the value of the hexadecimal digit it represents
/// (`0-9`, `a-f`, `A-F`), or `None` for every other byte.
const HEX: [Option<u8>; 256] = {
    let mut table = [None; 256];

    let mut digit = 0u8;
    while digit < 10 {
        table[(b'0' + digit) as usize] = Some(digit);
        digit += 1;
    }

    // Letters are accepted in both cases and map to 10..=15.
    let mut letter = 0u8;
    while letter < 6 {
        table[(b'a' + letter) as usize] = Some(10 + letter);
        table[(b'A' + letter) as usize] = Some(10 + letter);
        letter += 1;
    }

    table
};
/// Appends the UTF-8 encoding of `c` to `buf`.
// NOTE(review): this takes a `Vec`, which this file only imports when the
// `alloc` feature is on, yet the function itself is not gated on that
// feature -- confirm whether it should carry `#[cfg(feature = "alloc")]`.
#[inline]
pub(crate) fn append_char(buf: &mut Vec<u8>, c: char) {
    // A `char` encodes to at most four bytes.
    let mut scratch = [0u8; 4];
    let encoded = c.encode_utf8(&mut scratch);
    buf.extend_from_slice(encoded.as_bytes());
}
/// Collection impls that let token iterators be gathered with `collect()` /
/// `extend()` into `String` (escaped text) and `Vec<u8>` (unescaped bytes).
#[cfg(feature = "alloc")]
mod iter_traits {
    use super::{EscapedToken, UnescapedToken, append_char};
    use alloc::string::String;
    use alloc::vec::Vec;

    /// Concatenates the text of every escaped token into a new `String`.
    impl<'a> FromIterator<EscapedToken<'a>> for String {
        #[inline]
        fn from_iter<I: IntoIterator<Item = EscapedToken<'a>>>(iter: I) -> String {
            let mut collected = String::new();
            collected.extend(iter);
            collected
        }
    }

    /// Appends the text of every escaped token to the string.
    impl<'a> Extend<EscapedToken<'a>> for String {
        #[inline]
        fn extend<I: IntoIterator<Item = EscapedToken<'a>>>(&mut self, iter: I) {
            for token in iter {
                self.push_str(token.as_str());
            }
        }
    }

    /// Concatenates the bytes of every unescaped token into a new `Vec<u8>`.
    impl<'a> FromIterator<UnescapedToken<'a>> for Vec<u8> {
        #[inline]
        fn from_iter<I: IntoIterator<Item = UnescapedToken<'a>>>(iter: I) -> Vec<u8> {
            let mut collected = Vec::new();
            collected.extend(iter);
            collected
        }
    }

    /// Appends literal bytes as-is and decoded characters as UTF-8.
    impl<'a> Extend<UnescapedToken<'a>> for Vec<u8> {
        #[inline]
        fn extend<I: IntoIterator<Item = UnescapedToken<'a>>>(&mut self, iter: I) {
            for token in iter {
                match token {
                    UnescapedToken::Unescaped(ch) => append_char(self, ch),
                    UnescapedToken::Literal(literal) => self.extend_from_slice(literal),
                }
            }
        }
    }
}
/// Unit tests for the escape and unescape token iterators.
#[cfg(test)]
mod tests {
    use super::*;

    /// Expected iterator item for a literal chunk.
    fn lit(bytes: &[u8]) -> Option<Result<UnescapedToken<'_>, UnescapeError>> {
        Some(Ok(UnescapedToken::Literal(bytes)))
    }

    /// Expected iterator item for a decoded escape sequence.
    fn esc(c: char) -> Option<Result<UnescapedToken<'static>, UnescapeError>> {
        Some(Ok(UnescapedToken::Unescaped(c)))
    }

    #[test]
    fn test_empty_string() {
        let mut tokens = UnescapeTokens::new(b"");
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_pure_literal() {
        let mut tokens = UnescapeTokens::new(b"hello world");
        assert_eq!(tokens.next(), lit(b"hello world"));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_simple_escapes() {
        let mut tokens = UnescapeTokens::new(b"a\\nb\\tc");
        assert_eq!(tokens.next(), lit(b"a"));
        assert_eq!(tokens.next(), esc('\n'));
        assert_eq!(tokens.next(), lit(b"b"));
        assert_eq!(tokens.next(), esc('\t'));
        assert_eq!(tokens.next(), lit(b"c"));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_starts_with_escape() {
        let mut tokens = UnescapeTokens::new(b"\\nhello");
        assert_eq!(tokens.next(), esc('\n'));
        assert_eq!(tokens.next(), lit(b"hello"));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_ends_with_escape() {
        let mut tokens = UnescapeTokens::new(b"hello\\n");
        assert_eq!(tokens.next(), lit(b"hello"));
        assert_eq!(tokens.next(), esc('\n'));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_unicode_and_surrogate() {
        let mut tokens = UnescapeTokens::new(b"A is \\u0041, smiley is \\uD83D\\uDE00!");
        assert_eq!(tokens.next(), lit(b"A is "));
        assert_eq!(tokens.next(), esc('A'));
        assert_eq!(tokens.next(), lit(b", smiley is "));
        assert_eq!(tokens.next(), esc('😀'));
        assert_eq!(tokens.next(), lit(b"!"));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_invalid_escape_yields_literal_first() {
        let mut tokens = UnescapeTokens::new(b"ValidPart\\zInvalid");
        assert_eq!(tokens.next(), lit(b"ValidPart"));

        let err = tokens.next().unwrap().unwrap_err();
        let expected = UnescapeError {
            kind: UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'z' }),
            offset: 1,
        };
        assert_eq!(err, expected);

        // The iterator must not have moved past the bad escape.
        assert_eq!(tokens.remnant(), b"\\zInvalid");
        assert_eq!(tokens.next(), Some(Err(err)));
    }

    #[test]
    fn test_sticky_error_behavior() {
        let mut tokens = UnescapeTokens::new(b"a\\zb");
        assert_eq!(tokens.next(), lit(b"a"));

        let first_err = tokens.next().unwrap().unwrap_err();
        assert_eq!(
            first_err.kind,
            UnescapeErrorKind::InvalidEscape(InvalidEscapeError { found: b'z' })
        );
        assert_eq!(tokens.remnant(), b"\\zb");

        // Asking again reports the same error from the same position.
        let second_err = tokens.next().unwrap().unwrap_err();
        assert_eq!(first_err, second_err);
        assert_eq!(tokens.remnant(), b"\\zb");
    }

    #[test]
    fn test_incomplete_escape_at_end() {
        let mut tokens = UnescapeTokens::new(b"ValidPart\\u12");
        assert_eq!(tokens.next(), lit(b"ValidPart"));
        assert_eq!(tokens.remnant(), b"\\u12");

        let err = tokens.next().unwrap().unwrap_err();
        let expected = UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: 4,
        };
        assert_eq!(err, expected);
        assert_eq!(tokens.remnant(), b"\\u12");
        assert_eq!(tokens.next(), Some(Err(err)));
    }

    #[test]
    fn test_dangling_backslash() {
        let mut tokens = UnescapeTokens::new(b"end with \\");
        assert_eq!(tokens.next(), lit(b"end with "));

        let err = tokens.next().unwrap().unwrap_err();
        let expected = UnescapeError {
            kind: UnescapeErrorKind::UnexpectedEof,
            offset: 1,
        };
        assert_eq!(err, expected);
        assert_eq!(tokens.next(), Some(Err(err)));
    }

    #[test]
    fn test_display_unescape_tokens() {
        let rendered = alloc::format!(
            "{}",
            UnescapeTokens::new(b"hello \\u0041\\nworld").display_utf8()
        );
        assert_eq!(rendered, "hello A\nworld");
    }

    #[test]
    fn test_display_unescape_error() {
        let tokens = UnescapeTokens::new(b"hello\\z");
        let mut out = String::new();
        // Formatting fails at the bad escape, after the literal was written.
        write!(out, "{}", tokens.display_utf8_lossy()).unwrap_err();
        assert!(out.starts_with("hello"));
    }

    #[test]
    fn test_escape_no_escapes() {
        let mut tokens = EscapeTokens::new("hello world");
        assert_eq!(tokens.next(), Some(EscapedToken::Literal("hello world")));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_escape_simple() {
        let mut tokens = EscapeTokens::new("hello\nworld");
        assert_eq!(tokens.next(), Some(EscapedToken::Literal("hello")));
        assert_eq!(tokens.next(), Some(EscapedToken::Escaped(r#"\n"#)));
        assert_eq!(tokens.next(), Some(EscapedToken::Literal("world")));
        assert_eq!(tokens.next(), None);
    }

    #[test]
    fn test_display_escape_tokens() {
        let rendered = alloc::format!("{}", EscapeTokens::new("a\"b\tc"));
        assert_eq!(rendered, r#"a\"b\tc"#);
    }

    #[cfg(feature = "alloc")]
    #[test]
    fn test_escape_to_cow_borrowed() {
        let cow: Cow<'_, str> = EscapeTokens::new("no escapes here").into();
        assert!(matches!(cow, Cow::Borrowed(_)));
        assert_eq!(cow, "no escapes here");
    }

    #[cfg(feature = "alloc")]
    #[test]
    fn test_escape_to_cow_owned() {
        let cow: Cow<'_, str> = EscapeTokens::new("has\n an escape").into();
        assert!(matches!(cow, Cow::Owned(_)));
        assert_eq!(cow, r#"has\n an escape"#);
    }

    #[cfg(feature = "alloc")]
    #[test]
    fn test_unescape_to_cow_borrowed() {
        let cow: Cow<'_, [u8]> = UnescapeTokens::new(b"no escapes here")
            .try_into()
            .unwrap();
        assert!(matches!(cow, Cow::Borrowed(_)));
        assert_eq!(*cow, *b"no escapes here");
    }

    #[cfg(feature = "alloc")]
    #[test]
    fn test_unescape_to_cow_owned() {
        let cow: Cow<'_, [u8]> = UnescapeTokens::new(b"has\\n an escape")
            .try_into()
            .unwrap();
        assert!(matches!(cow, Cow::Owned(_)));
        assert_eq!(*cow, *b"has\n an escape");
    }
}
/// Tests for `EscapeTokens::find_escape_char`, with emphasis on the 16-byte
/// chunk boundary and the scalar tail.
#[cfg(test)]
mod find_escape_char_tests {
    use std::format;
    use super::{ESCAPE_DECISION_TABLE, EscapeTokens};

    /// Runs one case and reports its name on failure.
    fn run_test(input: &str, expected: Option<usize>, case_name: &str) {
        let found = EscapeTokens::find_escape_char(input.as_bytes());
        assert_eq!(found, expected, "Failed test case: '{}'", case_name);
    }

    /// Runs a table of `(input, expected, case name)` cases.
    fn run_all(cases: &[(&str, Option<usize>, &str)]) {
        for &(input, expected, case_name) in cases {
            run_test(input, expected, case_name);
        }
    }

    #[test]
    fn test_no_escapes() {
        run_all(&[
            ("", None, "Empty string"),
            ("Hello, world!", None, "Simple ASCII"),
            ("This string is exactly 16 bytes", None, "16-byte ASCII"),
            (
                "This string is over 16 bytes long now",
                None,
                "Over 16-byte ASCII",
            ),
            ("Hello, éàçüö!", None, "Non-ASCII UTF-8"),
            ("Testing with emojis 😀❤️✅", None, "Emojis"),
        ]);
    }

    #[test]
    fn test_single_escapes() {
        run_all(&[
            ("\"", Some(0), "Quote at start"),
            ("Hello \" world", Some(6), "Quote in middle"),
            ("Hello\\", Some(5), "Backslash at end"),
            ("\n", Some(0), "Control char (newline) at start"),
            ("Hello\tworld", Some(5), "Control char (tab) in middle"),
            (
                "Control char at end\u{08}",
                Some(19),
                "Control char (backspace) at end",
            ),
        ]);
    }

    #[test]
    fn test_finds_first_of_multiple() {
        run_all(&[
            ("a\"b\\c\nd", Some(1), "Finds first quote"),
            ("ab\\c\"d\ne", Some(2), "Finds first backslash"),
            ("abc\nd\"e\\f", Some(3), "Finds first control char"),
            ("\"\n\\", Some(0), "Multiple escapes at start"),
        ]);
    }

    #[test]
    fn test_simd_chunk_boundaries() {
        let fifteen = "a".repeat(15);
        let sixteen = "a".repeat(16);
        let seventeen = "a".repeat(17);
        run_test(&format!("{}\"", fifteen), Some(15), "Escape at index 15");
        run_test(&format!("{}\n", sixteen), Some(16), "Escape at index 16");
        run_test(&format!("{}\t", seventeen), Some(17), "Escape at index 17");

        let forty = "a".repeat(40);
        run_test(
            &format!("{}\\\\", forty),
            Some(40),
            "Escape deep in a long string",
        );
    }

    #[test]
    fn test_remainder_logic() {
        run_all(&[
            ("short\nstring", Some(5), "Short string with escape"),
            ("no escapes", None, "Short string no escape"),
        ]);

        let sixteen = "a".repeat(16);
        run_test(
            &format!("{}\"", sixteen),
            Some(16),
            "Escape in 1-byte remainder",
        );

        let fifteen = "b".repeat(15);
        run_test(
            &format!("{}{}\t", fifteen, fifteen),
            Some(30),
            "Escape at end of 15-byte remainder",
        );
    }

    #[test]
    fn test_all_escapable_bytes_individually() {
        // Sixteen harmless bytes followed by the byte under test.
        let prefix = "0123456789abcdef";
        for byte_val in 0..=255u8 {
            let mut test_bytes = [0u8; 17];
            test_bytes[..16].copy_from_slice(prefix.as_bytes());
            test_bytes[16] = byte_val;

            let result = EscapeTokens::find_escape_char(&test_bytes);
            if ESCAPE_DECISION_TABLE[byte_val as usize] == 1 {
                assert_eq!(
                    result,
                    Some(16),
                    "Failed to find required escape for byte 0x{:02X}",
                    byte_val
                );
            } else {
                assert_eq!(
                    result, None,
                    "Incorrectly found an escape for byte 0x{:02X}",
                    byte_val
                );
            }
        }
    }
}