use std::{intrinsics::unlikely, mem};
use crate::values::types::string::simd::{SwitchHaveSimd, Vector};
#[inline(always)]
unsafe fn chunk_non_ascii_or_need_escape<V: Vector>(chunk: V) -> bool {
#[allow(clippy::many_single_char_names)]
unsafe fn or4<V: Vector>(a: V, b: V, c: V, d: V) -> V {
let ab = V::or(a, b);
let cd = V::or(c, d);
V::or(ab, cd)
}
let any_control_or_non_ascii = chunk.cmplt(V::splat(32));
let any_7f = chunk.cmpeq(V::splat(0x7f));
let any_double_quote = chunk.cmpeq(V::splat(b'"'));
let any_backslash = chunk.cmpeq(V::splat(b'\\'));
let need_escape = or4(
any_control_or_non_ascii,
any_7f,
any_double_quote,
any_backslash,
);
need_escape.movemask() != 0
}
/// Append the escape sequence for `to_escape` to `buffer`.
///
/// Newline, carriage return, tab, backslash and double quote use their
/// two-character forms (`\n`, `\r`, `\t`, `\\`, `\"`). Every other character
/// is written as lowercase hex of its code point: `\xNN` below `0x100`,
/// `\uNNNN` below `0x10000`, and `\UNNNNNNNN` otherwise.
///
/// The function does not check whether the character actually needs
/// escaping; it always writes an escape.
#[inline(always)]
fn push_escape(to_escape: char, buffer: &mut String) {
    use std::fmt::Write;

    // Characters with a dedicated short escape.
    let short_form = match to_escape {
        '\n' => Some("\\n"),
        '\r' => Some("\\r"),
        '\t' => Some("\\t"),
        '\\' => Some("\\\\"),
        '"' => Some("\\\""),
        _ => None,
    };
    if let Some(text) = short_form {
        buffer.push_str(text);
        return;
    }

    // Everything else: pick the narrowest hex form that fits the code point.
    let code = to_escape as u32;
    let written = if code < 0x100 {
        write!(buffer, "\\x{:02x}", code)
    } else if code < 0x10000 {
        write!(buffer, "\\u{:04x}", code)
    } else {
        write!(buffer, "\\U{:08x}", code)
    };
    written.unwrap();
}
/// Decide whether `c` has to be written as an escape sequence.
///
/// Returns `true` for code points below `0x20`, for `"` and `\`, and for
/// everything in `0x7f..=0xff`. The remaining ASCII range (`0x20..=0x7e`) is
/// passed through. Above `0xff` a character is escaped unless
/// `char::is_alphanumeric` accepts it.
#[inline(always)]
fn need_escape(c: char) -> bool {
    match c {
        // Control characters and the two characters special inside quotes.
        '\0'..='\x1f' | '"' | '\\' => true,
        // Remaining ASCII, space through tilde.
        ' '..='~' => false,
        // DEL and the rest of the first 256 code points.
        '\x7f'..='\u{ff}' => true,
        other => !other.is_alphanumeric(),
    }
}
/// Append the quoted, escaped representation of `str` to `buffer`.
///
/// The output is a double quote, the contents of `str`, and a closing double
/// quote. Every character for which `need_escape` returns `true` is written
/// through `push_escape`; all other characters are copied unchanged. Anything
/// already in `buffer` is left in place.
pub(crate) fn string_repr(str: &str, buffer: &mut String) {
    /// Scalar path that decodes `val` one `char` at a time and escapes where
    /// required. Handles arbitrary Unicode.
    fn loop_unicode(val: &str, buffer: &mut String) {
        for x in val.chars() {
            if need_escape(x) {
                push_escape(x, buffer);
            } else {
                buffer.push(x);
            }
        }
    }

    /// Scalar byte-wise path for ASCII text. At the first byte `>= 0x7f` the
    /// rest of `val` is handed to `loop_unicode`.
    fn loop_ascii(val: &str, buffer: &mut String) {
        for (done, x) in val.as_bytes().iter().enumerate() {
            let x = *x;
            if unlikely(x >= 0x7f) {
                // Every byte before `done` was below 0x7f, so `done` is a
                // char boundary and the slice cannot panic.
                loop_unicode(&val[done..], buffer);
                return;
            }
            if unlikely(need_escape(x as char)) {
                push_escape(x as char, buffer);
            } else {
                // SAFETY: `x < 0x7f`, i.e. a single ASCII byte, so appending
                // it keeps the buffer valid UTF-8.
                let byte_buffer = unsafe { buffer.as_mut_vec() };
                byte_buffer.push(x);
            }
        }
    }

    /// Vectorised path: copy `val` in `size_of::<V>()`-byte chunks while no
    /// chunk contains a byte flagged by `chunk_non_ascii_or_need_escape`; on
    /// the first flagged chunk, fall back to `loop_ascii` for the unprocessed
    /// remainder.
    ///
    /// # Safety
    ///
    /// `buffer` must already have spare capacity for `val.len() + 1` bytes
    /// (only checked by a `debug_assert!`): the vector stores write directly
    /// into the spare capacity and never grow the buffer.
    /// NOTE(review): any further requirements of the `Vector` operations
    /// (declared elsewhere) apply as well.
    #[inline(always)]
    unsafe fn loop_ascii_simd<V: Vector>(val: &str, buffer: &mut String) {
        debug_assert!(buffer.capacity() - buffer.len() >= val.len() + 1);
        // Not enough input for a single full-width load.
        if val.len() < mem::size_of::<V>() {
            return loop_ascii(val, buffer);
        }

        /// Grow `buffer` by `tail_len` bytes using one full-width unaligned
        /// store of `vector` that ends exactly at the new end of the buffer.
        ///
        /// When `tail_len < size_of::<V>()` the store also rewrites the
        /// `size_of::<V>() - tail_len` bytes just before the old end, so the
        /// caller must pass a vector whose leading bytes equal what is
        /// already there.
        #[inline(always)]
        unsafe fn push_vec_tail<V: Vector>(buffer: &mut String, vector: V, tail_len: usize) {
            debug_assert!(tail_len > 0);
            debug_assert!(tail_len <= mem::size_of::<V>());
            let new_buffer_len = buffer.len() + tail_len;
            debug_assert!(new_buffer_len >= mem::size_of::<V>());
            debug_assert!(new_buffer_len <= buffer.capacity());
            // Take the destination pointer from the underlying `Vec`, not
            // from `as_bytes_mut()`: that slice spans only the initialized
            // `len` bytes, and a pointer derived from it must not be used to
            // write into the spare capacity beyond `len`.
            vector.store_unaligned(
                buffer
                    .as_mut_vec()
                    .as_mut_ptr()
                    .add(new_buffer_len)
                    .sub(mem::size_of::<V>()),
            );
            // The bytes up to `new_buffer_len` are now initialized.
            buffer.as_mut_vec().set_len(new_buffer_len);
        }

        let mut val_offset = 0;
        debug_assert!(val.len() >= mem::size_of::<V>());
        // Whole chunks.
        while val_offset + mem::size_of::<V>() <= val.len() {
            let chunk = V::load_unaligned(val.as_ptr().add(val_offset) as *const _);
            if chunk_non_ascii_or_need_escape(chunk) {
                return loop_ascii(&val[val_offset..], buffer);
            }
            push_vec_tail(buffer, chunk, mem::size_of::<V>());
            val_offset += mem::size_of::<V>();
        }
        debug_assert!(val_offset >= mem::size_of::<V>());
        debug_assert!(val_offset + mem::size_of::<V>() > val.len());
        debug_assert!(val_offset % mem::size_of::<V>() == 0);
        // Leftover shorter than one vector: reload the *last* full vector of
        // `val`, which overlaps bytes that were already copied, and let
        // `push_vec_tail` store it so that it ends at the new buffer end.
        if val_offset < val.len() {
            let chunk_len = val.len() - val_offset;
            debug_assert!(chunk_len > 0);
            debug_assert!(chunk_len < mem::size_of::<V>());
            debug_assert!(val.len() >= mem::size_of::<V>());
            let chunk = V::load_unaligned(val.as_ptr().add(val.len()).sub(mem::size_of::<V>()));
            if chunk_non_ascii_or_need_escape(chunk) {
                return loop_ascii(&val[val_offset..], buffer);
            }
            push_vec_tail(buffer, chunk, chunk_len);
        }
    }

    /// Carries the arguments through `SwitchHaveSimd` so that the SIMD or
    /// scalar implementation can be selected.
    struct Switch<'a> {
        s: &'a str,
        buffer: &'a mut String,
    }

    impl<'a> SwitchHaveSimd<()> for Switch<'a> {
        fn no_simd(self) {
            loop_ascii(self.s, self.buffer)
        }

        fn simd<V: Vector>(self) {
            // SAFETY: `string_repr` reserves `2 + str.len()` bytes and pushes
            // only the opening quote before dispatching, which leaves the
            // `val.len() + 1` spare bytes `loop_ascii_simd` requires.
            unsafe { loop_ascii_simd::<V>(self.s, self.buffer) }
        }
    }

    // Room for both quotes plus the contents when nothing needs escaping;
    // escapes grow the buffer through the ordinary `String` methods.
    buffer.reserve(2 + str.len());
    buffer.push('"');
    // `switch` comes from `SwitchHaveSimd` (defined elsewhere); presumably it
    // calls `simd` or `no_simd` depending on availability — confirm there.
    Switch { s: str, buffer }.switch();
    buffer.push('"');
}
// Unit tests for `string_repr` and the SIMD chunk classifier.
#[cfg(test)]
mod tests {
use std::mem;
use crate::{
assert,
values::types::string::repr::{chunk_non_ascii_or_need_escape, string_repr},
};
// Checks `repr(...)` through the project's `assert::all_true` helper
// (defined elsewhere); judging by its name, each line of the snippet is an
// expression that must evaluate to true.
#[test]
fn test_to_repr() {
assert::all_true(
r#"
"\"\\t\\n'\\\"\"" == repr("\t\n'\"")
"\"Hello, 世界\"" == repr("Hello, 世界")
"#,
);
}
// Calls `string_repr` directly and compares against the exact expected text
// for each escape form: `\xNN`, named escapes, `\uNNNN`, `\UNNNNNNNN`, and
// characters that pass through unescaped.
#[test]
fn test_string_repr() {
// Runs `string_repr` on `input` into a fresh `String` and compares.
fn test(expected: &str, input: &str) {
let mut repr = String::new();
string_repr(input, &mut repr);
assert_eq!(expected, &repr);
}
test(r#""\x12""#, "\x12");
test(r#""\x7f""#, "\x7f");
test(r#""\n""#, "\n");
test(r#""'""#, "'");
test(r#""\"""#, "\"");
test(r#""\\""#, "\\");
test(r#""\u200b""#, "\u{200b}");
test(r#""Hello, 世界""#, "Hello, 世界");
test(r#""\U0010ffff""#, "\u{10ffff}");
}
// Smoke test with 16-byte and longer inputs: plain ASCII, ASCII with an
// escape in the middle, and non-ASCII text.
#[test]
fn test_to_repr_long_smoke() {
assert::all_true(
r#"
'"0123456789abcdef"' == repr("0123456789abcdef")
'"0123456789\\nbcdef"' == repr("0123456789\nbcdef")
'"Мы, оглядываясь, видим лишь руины"' == repr("Мы, оглядываясь, видим лишь руины")
"#,
)
}
// Convenience wrapper: returns the repr of `s` as a new `String`.
fn string_repr_for_test(s: &str) -> String {
let mut r = String::new();
string_repr(s, &mut r);
r
}
// For every ASCII byte value (0..0x80), builds a string of 33 copies and runs
// `string_repr` on it. The result is not inspected, so this only checks that
// the call completes without panicking.
#[test]
fn to_repr_sse() {
for i in 0..0x80 {
let s = String::from_utf8((0..33).map(|_| i as u8).collect()).unwrap();
string_repr_for_test(&s);
}
}
// Digit-only strings of every length 0..100: output must be the input
// wrapped in quotes.
#[test]
fn to_repr_no_escape_all_lengths() {
for len in 0..100 {
let s = String::from_utf8((0..len).map(|i| b'0' + (i % 10)).collect()).unwrap();
assert_eq!(format!("\"{}\"", s), string_repr_for_test(&s));
}
}
// Same digit strings followed by a single newline, so the escape is always
// the last character of the input.
#[test]
fn to_repr_tail_escape_all_lengths() {
for len in 0..100 {
let s = String::from_utf8((0..len).map(|i| b'0' + (i % 10)).collect()).unwrap();
assert_eq!(
format!("\"{}\\n\"", s),
string_repr_for_test(&format!("{}\n", s))
);
}
}
// Digit string, newline, digit string again: the escape sits in the middle
// with `len` plain bytes on each side.
#[test]
fn to_repr_middle_escape_all_lengths() {
for len in 0..100 {
let s = String::from_utf8((0..len).map(|i| b'0' + (i % 10)).collect()).unwrap();
assert_eq!(
format!("\"{}\\n{}\"", s, s),
string_repr_for_test(&format!("{}\n{}", s, s))
);
}
}
// Exercises the chunk classifier directly on `__m128i`; compiled only when
// the `sse2` target feature is enabled.
#[cfg(target_feature = "sse2")]
#[test]
fn test_chunk_non_ascii_or_need_escape() {
use std::arch::x86_64::*;
use crate::values::types::string::simd::Vector;
// Loads `s` into a vector; `s` must be exactly `size_of::<__m128i>()` bytes.
unsafe fn load(s: &str) -> __m128i {
assert_eq!(s.len(), mem::size_of::<__m128i>());
<__m128i as Vector>::load_unaligned(s.as_ptr())
}
unsafe {
// Plain alphanumerics and a trailing space are not flagged.
assert!(!chunk_non_ascii_or_need_escape(load("0123456789abcdef")));
assert!(!chunk_non_ascii_or_need_escape(load("0123456789abcde ")));
// Control characters and 0x7f are flagged.
assert!(chunk_non_ascii_or_need_escape(load("0123456789ab\x19def")));
assert!(chunk_non_ascii_or_need_escape(load("0123456789abcde\n")));
assert!(chunk_non_ascii_or_need_escape(load("0123456789ab\x7fdef")));
assert!(chunk_non_ascii_or_need_escape(load(
"0123\x0456789ab\x02def"
)));
// A single quote is not flagged; a double quote is.
assert!(!chunk_non_ascii_or_need_escape(load("'123456789abcdef")));
assert!(chunk_non_ascii_or_need_escape(load("0\"23456789abcdef")));
// Non-ASCII (two-byte UTF-8 character) is flagged.
assert!(chunk_non_ascii_or_need_escape(load("0123456 Я bcdef")));
}
}
}