use std::iter::Peekable;
use std::str::Chars;
/// Base code point for raw-byte sentinels: a raw byte `b` is stored as the
/// single character `RAW_BYTE_SENTINEL_BASE + b`.
const RAW_BYTE_SENTINEL_BASE: u32 = 0xE000;
/// Lowest code point recognized as a raw-byte sentinel (base + 0x80).
const RAW_BYTE_SENTINEL_MIN: u32 = 0xE080;
/// Highest code point recognized as a raw-byte sentinel (base + 0xFF).
const RAW_BYTE_SENTINEL_MAX: u32 = 0xE0FF;
/// Lowest character code that is stored as a raw-byte sentinel.
const RAW_BYTE_CHAR_MIN: u32 = 0x3FFF80;
/// Highest character code that is stored as a raw-byte sentinel.
const RAW_BYTE_CHAR_MAX: u32 = 0x3FFFFF;
/// Base code point for unibyte-byte sentinels: a non-ASCII byte `b` of a
/// unibyte string is stored as `UNIBYTE_BYTE_SENTINEL_BASE + b`.
const UNIBYTE_BYTE_SENTINEL_BASE: u32 = 0xE300;
/// Lowest code point recognized as a unibyte-byte sentinel.
const UNIBYTE_BYTE_SENTINEL_MIN: u32 = 0xE300;
/// Highest code point recognized as a unibyte-byte sentinel.
const UNIBYTE_BYTE_SENTINEL_MAX: u32 = 0xE3FF;
/// Code point that opens an extended sequence (prefix, length, payload bytes).
const EXT_SEQ_PREFIX: u32 = 0xE100;
/// Base of the length sentinel: a payload of `n` bytes is announced by the
/// character `EXT_SEQ_LEN_BASE + n`.
const EXT_SEQ_LEN_BASE: u32 = 0xE110;
/// Base of the payload sentinels: each payload byte `b` is stored as the
/// character `EXT_SEQ_BYTE_BASE + b`.
const EXT_SEQ_BYTE_BASE: u32 = 0xE200;
/// Largest payload length the decoders accept; also the chunk size used when
/// arbitrary bytes are split into extended sequences.
const EXT_SEQ_MAX_LEN: u32 = 6;
/// Encodes a character code that lies above the Unicode range as a storage
/// string.
///
/// Returns `None` for codes at or below `0x10FFFF` and for codes above
/// `0x3FFFFF`. Codes in the raw-byte range become one raw-byte sentinel
/// character; every other accepted code is written as extended UTF-8 bytes
/// wrapped in an extended sequence.
pub(crate) fn encode_nonunicode_char_for_storage(code: u32) -> Option<String> {
    match code {
        // Anything that fits the Unicode range is not handled here.
        0..=0x10FFFF => None,
        RAW_BYTE_CHAR_MIN..=RAW_BYTE_CHAR_MAX => {
            // Subtracting 0x3FFF00 leaves the raw byte value (0x80..=0xFF).
            let byte_value = code - 0x3FFF00;
            let sentinel = char::from_u32(RAW_BYTE_SENTINEL_BASE + byte_value)
                .expect("valid raw-byte sentinel");
            Some(sentinel.to_string())
        }
        _ if code <= 0x3FFFFF => {
            let utf8 = encode_emacs_extended_utf8(code);
            Some(encode_extended_sequence_for_storage(&utf8))
        }
        _ => None,
    }
}
/// Encodes `code` with the UTF-8 bit layout extended to five- and six-byte
/// forms, so values beyond `0x10FFFF` can be represented.
///
/// The number of bytes depends only on the magnitude of `code`: 1 byte up to
/// `0x7F`, 2 up to `0x7FF`, 3 up to `0xFFFF`, 4 up to `0x1FFFFF`, 5 up to
/// `0x3FFFFFF`, and 6 beyond that.
fn encode_emacs_extended_utf8(code: u32) -> Vec<u8> {
    // Continuation byte carrying the six bits of `code` that start at `shift`.
    let tail = |shift: u32| 0x80 | (((code >> shift) & 0x3F) as u8);

    match code {
        0..=0x7F => vec![code as u8],
        0x80..=0x7FF => vec![0xC0 | ((code >> 6) as u8), tail(0)],
        0x800..=0xFFFF => vec![0xE0 | ((code >> 12) as u8), tail(6), tail(0)],
        0x1_0000..=0x1F_FFFF => vec![
            0xF0 | (((code >> 18) & 0x07) as u8),
            tail(12),
            tail(6),
            tail(0),
        ],
        0x20_0000..=0x3FF_FFFF => vec![
            0xF8 | (((code >> 24) & 0x03) as u8),
            tail(18),
            tail(12),
            tail(6),
            tail(0),
        ],
        _ => vec![
            0xFC | (((code >> 30) & 0x01) as u8),
            tail(24),
            tail(18),
            tail(12),
            tail(6),
            tail(0),
        ],
    }
}
/// Decodes one extended-UTF-8 sequence (1 to 6 bytes) into its code value.
///
/// A lone byte decodes to itself, whatever its value. The two-byte forms led
/// by `0xC0`/`0xC1` decode to `0x80..=0xFF`. Returns `None` when the slice is
/// empty, longer than six bytes, has a lead byte that does not match its
/// length, or contains a trailing byte that is not of the form `10xxxxxx`.
fn decode_emacs_extended_utf8(bytes: &[u8]) -> Option<u32> {
    let (&lead, tail) = bytes.split_first()?;
    if tail.is_empty() {
        return Some(lead as u32);
    }
    if tail.len() > 5 || !tail.iter().all(|b| b & 0xC0 == 0x80) {
        return None;
    }

    // Payload bits carried by the lead byte, selected by (trailing count, lead).
    let seed = match (tail.len(), lead) {
        (1, 0xC0) | (1, 0xC1) => {
            // Overlong two-byte form: maps onto the byte values 0x80..=0xFF.
            let high = ((lead & 0x01) as u32) << 6;
            return Some(0x80 + high + ((tail[0] & 0x3F) as u32));
        }
        (1, 0xC2..=0xDF) => lead & 0x1F,
        (2, 0xE0..=0xEF) => lead & 0x0F,
        (3, 0xF0..=0xF7) => lead & 0x07,
        (4, 0xF8..=0xFB) => lead & 0x03,
        (5, 0xFC) | (5, 0xFD) => lead & 0x01,
        _ => return None,
    };

    // Each continuation byte contributes its low six bits.
    let value = tail
        .iter()
        .fold(seed as u32, |acc, &b| (acc << 6) | ((b & 0x3F) as u32));
    Some(value)
}
/// Wraps `bytes` in an extended sequence: the prefix sentinel, a length
/// sentinel (`EXT_SEQ_LEN_BASE + bytes.len()`), then one sentinel character
/// per byte (`EXT_SEQ_BYTE_BASE + byte`).
///
/// No length limit is enforced here; the decoders in this file only accept
/// payloads of 1 to `EXT_SEQ_MAX_LEN` bytes.
fn encode_extended_sequence_for_storage(bytes: &[u8]) -> String {
    let prefix = char::from_u32(EXT_SEQ_PREFIX).expect("valid extended prefix sentinel");
    let length = char::from_u32(EXT_SEQ_LEN_BASE + bytes.len() as u32)
        .expect("valid extended length sentinel");
    let payload = bytes.iter().map(|&byte| {
        char::from_u32(EXT_SEQ_BYTE_BASE + (byte as u32)).expect("valid extended byte sentinel")
    });

    let mut encoded = String::new();
    encoded.push(prefix);
    encoded.push(length);
    encoded.extend(payload);
    encoded
}
/// Tries to read a complete extended sequence that begins at byte offset
/// `start` of `s`.
///
/// On success returns the byte offset just past the sequence together with
/// the code value decoded from its payload. Returns `None` when the prefix,
/// length sentinel, or any payload sentinel is missing or out of range, or
/// when the payload is not a valid extended-UTF-8 sequence.
///
/// Slicing panics if `start` is not a char boundary inside `s`.
fn decode_extended_sequence_span(s: &str, start: usize) -> Option<(usize, u32)> {
    let mut cursor = s[start..].char_indices();

    let (_, head) = cursor.next()?;
    if head as u32 != EXT_SEQ_PREFIX {
        return None;
    }

    let (len_at, len_marker) = cursor.next()?;
    let count = (len_marker as u32)
        .checked_sub(EXT_SEQ_LEN_BASE)
        .filter(|n| (1..=EXT_SEQ_MAX_LEN).contains(n))? as usize;

    let mut payload = Vec::with_capacity(count);
    // Offset (relative to `start`) just past the last character consumed.
    let mut stop = len_at + len_marker.len_utf8();
    for _ in 0..count {
        let (at, unit) = cursor.next()?;
        let value = (unit as u32)
            .checked_sub(EXT_SEQ_BYTE_BASE)
            .filter(|v| *v <= 0xFF)?;
        payload.push(value as u8);
        stop = at + unit.len_utf8();
    }

    let cp = decode_emacs_extended_utf8(&payload)?;
    Some((start + stop, cp))
}
/// Appends one byte of a unibyte string to `out` as it appears inside a
/// string literal: `"` and `\` get a backslash, bytes `>= 0x80` become a
/// three-digit octal escape, everything else is copied unchanged.
fn push_unibyte_literal_byte(out: &mut Vec<u8>, byte: u8) {
    if byte == b'"' || byte == b'\\' {
        out.push(b'\\');
        out.push(byte);
    } else if byte >= 0x80 {
        push_octal_escape(out, byte);
    } else {
        out.push(byte);
    }
}
/// Splits a storage string into logical units.
///
/// Each tuple is `(start_byte, end_byte, code, width, byte_len)`, where the
/// byte offsets refer to `s` itself:
/// - raw-byte sentinel: code `0x3FFF00 + byte`, width 4, byte length 2;
/// - unibyte-byte sentinel: code is the byte, width 1, byte length 1;
/// - well-formed extended sequence: decoded code, width 1, byte length equal
///   to the payload length announced by the length sentinel;
/// - any other character (including the prefix of a malformed extended
///   sequence): its own code, `char_width`, and UTF-8 length.
fn scan_storage_units(s: &str) -> Vec<(usize, usize, u32, usize, usize)> {
    let mut units = Vec::new();
    let mut pos = 0usize;

    while pos < s.len() {
        let ch = s[pos..].chars().next().expect("valid utf-8 char boundary");
        let code = ch as u32;
        let after = pos + ch.len_utf8();

        // Only the prefix sentinel can open an extended sequence.
        let extended = if code == EXT_SEQ_PREFIX {
            decode_extended_sequence_span(s, pos)
        } else {
            None
        };

        let unit: (usize, usize, u32, usize, usize) =
            if (RAW_BYTE_SENTINEL_MIN..=RAW_BYTE_SENTINEL_MAX).contains(&code) {
                let raw = (code - RAW_BYTE_SENTINEL_BASE) as u8;
                (pos, after, 0x3FFF00 + raw as u32, 4, 2)
            } else if (UNIBYTE_BYTE_SENTINEL_MIN..=UNIBYTE_BYTE_SENTINEL_MAX).contains(&code) {
                let byte = (code - UNIBYTE_BYTE_SENTINEL_BASE) as u8;
                (pos, after, byte as u32, 1, 1)
            } else if let Some((end, cp)) = extended {
                // The second character of the span is the length sentinel.
                let len_marker = s[pos..end].chars().nth(1).expect("extended len sentinel");
                let byte_len = ((len_marker as u32) - EXT_SEQ_LEN_BASE) as usize;
                (pos, end, cp, 1, byte_len)
            } else {
                (pos, after, code, crate::encoding::char_width(ch), ch.len_utf8())
            };

        pos = unit.1;
        units.push(unit);
    }

    units
}
/// Reports whether `s` contains any character from the sentinel ranges used
/// by this file (raw-byte, unibyte-byte, extended prefix, extended length,
/// extended payload). Pure-ASCII strings are answered without scanning.
fn storage_has_special_units(s: &str) -> bool {
    let is_sentinel = |ch: char| {
        let code = ch as u32;
        let within = |lo: u32, hi: u32| lo <= code && code <= hi;
        within(RAW_BYTE_SENTINEL_MIN, RAW_BYTE_SENTINEL_MAX)
            || within(UNIBYTE_BYTE_SENTINEL_MIN, UNIBYTE_BYTE_SENTINEL_MAX)
            || code == EXT_SEQ_PREFIX
            || within(EXT_SEQ_LEN_BASE + 1, EXT_SEQ_LEN_BASE + EXT_SEQ_MAX_LEN)
            || within(EXT_SEQ_BYTE_BASE, EXT_SEQ_BYTE_BASE + 0xFF)
    };

    !s.is_ascii() && s.chars().any(is_sentinel)
}
/// Converts a character index into a byte offset for a string without
/// sentinels. Indices at or past the end map to `s.len()`.
fn plain_utf8_char_to_byte(s: &str, char_idx: usize) -> usize {
    if s.is_ascii() {
        // One byte per char, so the index is the offset (clamped to the end).
        return s.len().min(char_idx);
    }
    match s.char_indices().nth(char_idx) {
        Some((offset, _)) => offset,
        None => s.len(),
    }
}
/// Converts arbitrary bytes into a storage string.
///
/// Valid UTF-8 input is returned as-is. Anything else is split into chunks of
/// at most `EXT_SEQ_MAX_LEN` bytes, each written as one extended sequence.
pub(crate) fn bytes_to_storage_string(bytes: &[u8]) -> String {
    // Validate the borrowed slice first so that invalid input does not pay
    // for a full copy that would be thrown away immediately.
    if let Ok(text) = std::str::from_utf8(bytes) {
        return text.to_owned();
    }

    let max_chunk = EXT_SEQ_MAX_LEN as usize;
    // Every sentinel lies in U+E000..U+EFFF and therefore takes 3 UTF-8
    // bytes; each chunk adds a prefix and a length sentinel to its payload.
    let chunk_count = bytes.len() / max_chunk + 1;
    let mut out = String::with_capacity(3 * (bytes.len() + 2 * chunk_count));
    for chunk in bytes.chunks(max_chunk) {
        out.push_str(&encode_extended_sequence_for_storage(chunk));
    }
    out
}
/// Converts the bytes of a unibyte string into a storage string: ASCII bytes
/// are kept as the matching character, every other byte becomes the sentinel
/// `UNIBYTE_BYTE_SENTINEL_BASE + byte`.
pub(crate) fn bytes_to_unibyte_storage_string(bytes: &[u8]) -> String {
    bytes
        .iter()
        .map(|&byte| {
            if byte.is_ascii() {
                char::from(byte)
            } else {
                char::from_u32(UNIBYTE_BYTE_SENTINEL_BASE + (byte as u32))
                    .expect("valid unibyte-byte sentinel")
            }
        })
        .collect()
}
/// Encodes a single character code for storage in a string.
///
/// With `multibyte == false` only codes `0..=0xff` are accepted and are
/// stored as one unibyte byte. With `multibyte == true` any valid Rust `char`
/// is stored directly and other codes are handed to
/// `encode_nonunicode_char_for_storage`. Returns `None` when the code cannot
/// be represented.
pub(crate) fn encode_char_code_for_string_storage(code: u32, multibyte: bool) -> Option<String> {
    if multibyte {
        return match char::from_u32(code) {
            Some(ch) => Some(ch.to_string()),
            None => encode_nonunicode_char_for_storage(code),
        };
    }

    if code > 0xff {
        None
    } else {
        Some(bytes_to_unibyte_storage_string(&[code as u8]))
    }
}
/// Decodes a storage string into `(code, width)` pairs, one per logical unit.
///
/// Strings without sentinels take a direct per-`char` path; all others are
/// scanned with `scan_storage_units`.
pub(crate) fn decode_storage_units(s: &str) -> Vec<(u32, usize)> {
    if storage_has_special_units(s) {
        let mut decoded = Vec::new();
        for (_, _, cp, width, _) in scan_storage_units(s) {
            decoded.push((cp, width));
        }
        return decoded;
    }

    let mut plain = Vec::new();
    for ch in s.chars() {
        plain.push((ch as u32, crate::encoding::char_width(ch)));
    }
    plain
}
/// Returns the character codes of the logical units in `s`, in order.
pub(crate) fn decode_storage_char_codes(s: &str) -> Vec<u32> {
    let units = decode_storage_units(s);
    units.iter().map(|&(cp, _)| cp).collect()
}
/// Returns the sum of the widths of all logical units in `s`.
pub(crate) fn storage_string_display_width(s: &str) -> usize {
    let units = decode_storage_units(s);
    units.iter().map(|&(_, width)| width).sum()
}
/// Returns the number of logical characters in `s`; each sentinel or
/// well-formed extended sequence counts as one.
pub(crate) fn storage_char_len(s: &str) -> usize {
    if storage_has_special_units(s) {
        return scan_storage_units(s).len();
    }
    if s.is_ascii() {
        // One byte per character.
        s.len()
    } else {
        s.chars().count()
    }
}
/// Returns the logical byte length of `s`: the sum of the per-unit byte
/// lengths reported by `scan_storage_units`, or simply `s.len()` when the
/// string contains no sentinels.
pub(crate) fn storage_byte_len(s: &str) -> usize {
    if storage_has_special_units(s) {
        scan_storage_units(s).iter().map(|unit| unit.4).sum()
    } else {
        s.len()
    }
}
/// Maps a logical character index to the byte offset in `s` where that unit
/// starts. Indices at or past the end map to `s.len()`.
pub(crate) fn storage_char_to_byte(s: &str, char_idx: usize) -> usize {
    if !storage_has_special_units(s) {
        return plain_utf8_char_to_byte(s, char_idx);
    }
    scan_storage_units(s)
        .get(char_idx)
        .map_or(s.len(), |unit| unit.0)
}
/// Maps a byte offset in `s` to the index of the logical character that
/// contains it.
///
/// An offset that falls inside a multi-byte character (or inside a
/// multi-character extended sequence) resolves to the index of that
/// character, so every byte belonging to one character yields the same
/// result. Offsets at or past the end yield the character count.
pub(crate) fn storage_byte_to_char(s: &str, byte_pos: usize) -> usize {
    if !storage_has_special_units(s) {
        let clamped = byte_pos.min(s.len());
        if s.is_ascii() {
            return clamped;
        }
        // Count only characters that end at or before the offset. Counting
        // characters that merely start before it would report the *next*
        // character for a mid-character offset, disagreeing with the scan
        // path below.
        return s
            .char_indices()
            .take_while(|(idx, ch)| idx + ch.len_utf8() <= clamped)
            .count();
    }

    let units = scan_storage_units(s);
    // First unit whose end lies beyond the offset is the one containing it.
    units
        .iter()
        .position(|unit| byte_pos < unit.1)
        .unwrap_or(units.len())
}
/// Converts the logical character range `from..to` into byte offsets in `s`.
///
/// Returns `None` when `from > to` or when `to` exceeds the logical character
/// count of `s`.
pub(crate) fn storage_substring_bounds(s: &str, from: usize, to: usize) -> Option<(usize, usize)> {
    if from > to {
        return None;
    }

    if storage_has_special_units(s) {
        let units = scan_storage_units(s);
        if to > units.len() {
            return None;
        }
        // An index equal to the unit count denotes the end of the string.
        let byte_at = |index: usize| units.get(index).map_or(s.len(), |unit| unit.0);
        return Some((byte_at(from), byte_at(to)));
    }

    let char_len = if s.is_ascii() {
        s.len()
    } else {
        s.chars().count()
    };
    if to > char_len {
        return None;
    }
    Some((
        plain_utf8_char_to_byte(s, from),
        plain_utf8_char_to_byte(s, to),
    ))
}
/// Returns the part of `s` covering logical characters `from..to` as an owned
/// string, or `None` when `storage_substring_bounds` rejects the range.
pub(crate) fn storage_substring(s: &str, from: usize, to: usize) -> Option<String> {
    storage_substring_bounds(s, from, to).map(|(start, end)| s[start..end].to_owned())
}
/// Reads the remainder of an extended sequence (length sentinel followed by
/// payload sentinels) from `chars`; the prefix sentinel must already have
/// been consumed by the caller.
///
/// On success the sequence is consumed and its payload bytes are returned.
/// On failure (missing or out-of-range length or payload sentinel) `None` is
/// returned and `chars` is left exactly where it was, so the caller can
/// handle the following characters as ordinary text instead of losing them.
fn decode_extended_sequence(chars: &mut Peekable<Chars<'_>>) -> Option<Vec<u8>> {
    // Parse on a copy and commit only when the whole sequence is well-formed;
    // advancing `chars` directly would drop already-read sentinels whenever a
    // later one turns out to be invalid.
    let mut probe = chars.clone();

    let len_code = probe.next()? as u32;
    if !(EXT_SEQ_LEN_BASE + 1..=EXT_SEQ_LEN_BASE + EXT_SEQ_MAX_LEN).contains(&len_code) {
        return None;
    }
    let len = (len_code - EXT_SEQ_LEN_BASE) as usize;

    let mut out = Vec::with_capacity(len);
    for _ in 0..len {
        let b = probe.next()? as u32;
        if !(EXT_SEQ_BYTE_BASE..=EXT_SEQ_BYTE_BASE + 0xFF).contains(&b) {
            return None;
        }
        out.push((b - EXT_SEQ_BYTE_BASE) as u8);
    }

    *chars = probe;
    Some(out)
}
/// Appends `byte` to `out` as a backslash followed by exactly three octal
/// digits (zero-padded).
fn push_octal_escape(out: &mut Vec<u8>, byte: u8) {
    let escape = [
        b'\\',
        b'0' + (byte >> 6),
        b'0' + ((byte >> 3) & 7),
        b'0' + (byte & 7),
    ];
    out.extend_from_slice(&escape);
}
/// Appends `byte` to `out` in escaped form: `"` and `\` get a backslash, the
/// control bytes with a letter escape use it (`\a \b \t \n \v \f \r \e`), the
/// remaining bytes below `0x20` and `0x7F` become three-digit octal escapes,
/// and everything else (including bytes `>= 0x80`) is copied unchanged.
fn push_escaped_literal_byte(out: &mut Vec<u8>, byte: u8) {
    // Character that follows the backslash, when a short escape exists.
    let short_escape = match byte {
        b'"' | b'\\' => Some(byte),
        0x07 => Some(b'a'),
        0x08 => Some(b'b'),
        b'\t' => Some(b't'),
        b'\n' => Some(b'n'),
        0x0B => Some(b'v'),
        0x0C => Some(b'f'),
        b'\r' => Some(b'r'),
        0x1B => Some(b'e'),
        _ => None,
    };

    if let Some(letter) = short_escape {
        out.push(b'\\');
        out.push(letter);
    } else if byte < 0x20 || byte == 0x7F {
        push_octal_escape(out, byte);
    } else {
        out.push(byte);
    }
}
use super::print::PrintOptions;
/// Formats `s` as a quoted Lisp string literal using default print options.
///
/// The rendered bytes are converted lossily, so any invalid UTF-8 in them is
/// replaced with U+FFFD.
pub(crate) fn format_lisp_string(s: &str) -> String {
    let defaults = PrintOptions::default();
    let rendered = format_lisp_string_bytes_inner(s, &defaults);
    String::from_utf8_lossy(&rendered).into_owned()
}
/// Formats `s` as a quoted Lisp string literal with default print options,
/// except that `print_escape_newlines` is set to `escape_newlines`.
///
/// The rendered bytes are converted lossily, so any invalid UTF-8 in them is
/// replaced with U+FFFD.
pub(crate) fn format_lisp_string_with_escape(s: &str, escape_newlines: bool) -> String {
    let mut options = PrintOptions::default();
    options.print_escape_newlines = escape_newlines;
    let rendered = format_lisp_string_bytes_inner(s, &options);
    String::from_utf8_lossy(&rendered).into_owned()
}
/// Formats `s` as a quoted Lisp string literal under the given `options`.
///
/// The rendered bytes are converted lossily, so any invalid UTF-8 in them is
/// replaced with U+FFFD.
pub(crate) fn format_lisp_string_with_options(s: &str, options: &PrintOptions) -> String {
    let rendered = format_lisp_string_bytes_inner(s, options);
    String::from_utf8_lossy(&rendered).into_owned()
}
/// Formats `s` as a quoted Lisp string literal using default print options
/// and returns the raw bytes, without any UTF-8 conversion.
pub(crate) fn format_lisp_string_bytes(s: &str) -> Vec<u8> {
    let defaults = PrintOptions::default();
    format_lisp_string_bytes_inner(s, &defaults)
}
/// Appends `byte` to `out` as a backslash plus the shortest safe octal form.
///
/// Three digits are used when the value needs them or when `next_char` is an
/// octal digit (`'0'..='7'`), which would otherwise run into the escape; two
/// digits when the value exceeds `0o7`; one digit otherwise.
fn push_octal_escape_contextual(out: &mut Vec<u8>, byte: u8, next_char: Option<char>) {
    let followed_by_octal_digit = matches!(next_char, Some('0'..='7'));
    let digits: u32 = if byte > 0o77 || followed_by_octal_digit {
        3
    } else if byte > 0o7 {
        2
    } else {
        1
    };

    out.push(b'\\');
    // Emit the most significant octal digit first.
    for place in (0..digits).rev() {
        out.push(b'0' + ((byte >> (3 * place)) & 7));
    }
}
/// Renders the storage string `s` as a double-quoted Lisp string literal and
/// returns the literal's bytes.
///
/// Each character of `s` is handled by the first matching rule, in this
/// order: unibyte-byte sentinel, raw-byte sentinel, extended sequence,
/// non-ASCII hex escape (`print_escape_multibyte`), ASCII hex digit, newline
/// and formfeed (`print_escape_newlines`), quote and backslash, control
/// character (`print_escape_control_characters`), and finally plain UTF-8.
pub(crate) fn format_lisp_string_bytes_inner(s: &str, options: &PrintOptions) -> Vec<u8> {
let mut out = Vec::with_capacity(s.len() + 2);
out.push(b'"');
// True immediately after a `\x…` hex escape has been written. A following
// ASCII hex digit is then preceded by `\ `, presumably so that a reader does
// not absorb the digit into the escape.
let mut need_nonhex = false;
let mut chars = s.chars().peekable();
while let Some(ch) = chars.next() {
let code = ch as u32;
// Unibyte-byte sentinel: recover the byte. Non-ASCII bytes are written as a
// context-aware octal escape when `print_escape_nonascii` is set; otherwise
// the unibyte literal rules apply.
if (UNIBYTE_BYTE_SENTINEL_MIN..=UNIBYTE_BYTE_SENTINEL_MAX).contains(&code) {
let byte = (code - UNIBYTE_BYTE_SENTINEL_BASE) as u8;
if byte >= 0x80 && options.print_escape_nonascii {
push_octal_escape_contextual(&mut out, byte, chars.peek().copied());
} else {
push_unibyte_literal_byte(&mut out, byte);
}
need_nonhex = false;
continue;
}
// Raw-byte sentinel: always written as a context-aware octal escape.
if (RAW_BYTE_SENTINEL_MIN..=RAW_BYTE_SENTINEL_MAX).contains(&code) {
let byte = (code - RAW_BYTE_SENTINEL_BASE) as u8;
push_octal_escape_contextual(&mut out, byte, chars.peek().copied());
need_nonhex = false;
continue;
}
// Extended sequence: write its payload bytes in escaped form. If decoding
// fails, the prefix character falls through to the generic rules below.
if code == EXT_SEQ_PREFIX {
if let Some(bytes) = decode_extended_sequence(&mut chars) {
for b in bytes {
push_escaped_literal_byte(&mut out, b);
}
need_nonhex = false;
continue;
}
}
// Non-ASCII character as `\x` followed by at least four lowercase hex digits.
if options.print_escape_multibyte && code > 0x7F {
let hex = format!("\\x{:04x}", code);
out.extend_from_slice(hex.as_bytes());
need_nonhex = true;
continue;
}
// ASCII hex digit: separate it from a directly preceding hex escape.
if code <= 0x7F && is_hex_digit(code as u8) {
if need_nonhex {
out.extend_from_slice(b"\\ ");
}
out.push(code as u8);
need_nonhex = false;
continue;
}
// Newline and formfeed get letter escapes only under `print_escape_newlines`.
if ch == '\n' && options.print_escape_newlines {
out.extend_from_slice(b"\\n");
need_nonhex = false;
continue;
}
if ch == '\x0c' && options.print_escape_newlines {
out.extend_from_slice(b"\\f");
need_nonhex = false;
continue;
}
// Quote and backslash are always backslash-escaped.
if ch == '"' || ch == '\\' {
out.push(b'\\');
out.push(ch as u8);
need_nonhex = false;
continue;
}
// With `print_escape_control_characters`, ASCII control characters other
// than tab, newline and formfeed, as well as DEL (0x7f), become octal escapes.
if options.print_escape_control_characters && code <= 0x7F {
let b = code as u8;
if b < 0x20 && b != b'\t' && b != b'\n' && b != 0x0c {
push_octal_escape_contextual(&mut out, b, chars.peek().copied());
need_nonhex = false;
continue;
}
if b == 0x7f {
push_octal_escape_contextual(&mut out, b, chars.peek().copied());
need_nonhex = false;
continue;
}
}
// Everything else is copied as its UTF-8 encoding.
let mut tmp = [0u8; 4];
let bytes = ch.encode_utf8(&mut tmp).as_bytes();
out.extend_from_slice(bytes);
need_nonhex = false;
}
out.push(b'"');
out
}
/// Returns `true` when `b` is an ASCII hexadecimal digit (`0-9`, `a-f`,
/// `A-F`).
fn is_hex_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
}
// Unit tests for this module are kept in the sibling file
// `string_escape_test.rs` and are compiled only in test builds.
#[cfg(test)]
#[path = "string_escape_test.rs"]
mod tests;