use std::fmt;
use serde::Serialize;
/// A string held either as UTF-8 text or as raw 16-bit code units.
///
/// The representation is private. Use [`JsString::as_ref`] for a borrowed view
/// of whichever form is stored, or [`JsString::as_str`] when only the UTF-8
/// form is acceptable.
///
/// Equality and hashing are derived from the private representation.
// NOTE(review): the name suggests this models a JavaScript string value, which
// may contain unpaired surrogates — confirm against the callers.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct JsString(Repr);
/// Private backing storage for [`JsString`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum Repr {
/// Text stored as a Rust `String`.
Utf8(String),
/// Raw 16-bit code units. In this file the variant is only constructed by
/// `JsString::from_code_units`, and only when `String::from_utf16` rejects
/// the units.
Wtf16(Vec<u16>),
}
/// Borrowed view of a [`JsString`], as returned by [`JsString::as_ref`].
///
/// The variants mirror the two storage forms of the owning string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JsStringRef<'a> {
/// The string is stored as UTF-8; borrows that text.
Utf8(&'a str),
/// The string is stored as raw 16-bit code units; borrows that slice.
Wtf16(&'a [u16]),
}
impl JsString {
    /// Text that opens every lone-surrogate marker.
    const MARKER_PREFIX: &'static str = "__SURROGATE_";
    /// Text that closes every lone-surrogate marker.
    const MARKER_SUFFIX: &'static str = "__";
    /// Number of upper-case hex digits between the prefix and the suffix.
    const MARKER_HEX_DIGITS: usize = 4;
    /// Total byte length of one marker (`__SURROGATE_XXXX__`).
    const MARKER_LEN: usize =
        Self::MARKER_PREFIX.len() + Self::MARKER_HEX_DIGITS + Self::MARKER_SUFFIX.len();

    /// Builds a string from 16-bit code units.
    ///
    /// If the units are valid UTF-16 they are converted to UTF-8 storage;
    /// otherwise the units are kept verbatim. As a result, two strings with the
    /// same code units always end up in the same representation and compare
    /// equal.
    pub fn from_code_units(units: Vec<u16>) -> Self {
        match String::from_utf16(&units) {
            Ok(text) => JsString(Repr::Utf8(text)),
            Err(_) => JsString(Repr::Wtf16(units)),
        }
    }

    /// Returns a borrowed view of whichever representation is stored.
    pub fn as_ref(&self) -> JsStringRef<'_> {
        match &self.0 {
            Repr::Utf8(text) => JsStringRef::Utf8(text),
            Repr::Wtf16(units) => JsStringRef::Wtf16(units),
        }
    }

    /// Returns the text when the string is stored as UTF-8, `None` when it is
    /// stored as raw code units.
    pub fn as_str(&self) -> Option<&str> {
        match &self.0 {
            Repr::Utf8(text) => Some(text),
            Repr::Wtf16(_) => None,
        }
    }

    /// Returns the string as an owned vector of UTF-16 code units.
    pub fn code_units(&self) -> Vec<u16> {
        match &self.0 {
            Repr::Utf8(text) => text.encode_utf16().collect(),
            Repr::Wtf16(units) => units.clone(),
        }
    }

    /// Returns the length of the string measured in UTF-16 code units.
    pub fn len_utf16(&self) -> usize {
        match &self.0 {
            Repr::Utf8(text) => text.encode_utf16().count(),
            Repr::Wtf16(units) => units.len(),
        }
    }

    /// Converts to a `String`, replacing every unpaired surrogate with
    /// U+FFFD (the behaviour of `String::from_utf16_lossy`).
    pub fn to_string_lossy(&self) -> String {
        match &self.0 {
            Repr::Utf8(text) => text.clone(),
            Repr::Wtf16(units) => String::from_utf16_lossy(units),
        }
    }

    /// Decodes text produced by [`JsString::to_marker_string`].
    ///
    /// Every well-formed marker `__SURROGATE_XXXX__` — exactly four upper-case
    /// hex digits whose value lies in `0xD800..=0xDFFF` — is replaced by that
    /// single code unit. Anything else, including truncated markers,
    /// lower-case hex and non-surrogate values, is kept as literal text.
    ///
    /// Literal input text that happens to spell a well-formed marker cannot be
    /// told apart from an encoded surrogate and is decoded as one.
    pub fn from_marker_string(s: &str) -> Self {
        let mut cursor = match s.find(Self::MARKER_PREFIX) {
            Some(first) => first,
            // Fast path: nothing to decode, keep the text as UTF-8.
            None => return JsString(Repr::Utf8(s.to_string())),
        };
        let bytes = s.as_bytes();
        let mut units: Vec<u16> = Vec::with_capacity(s.len());
        // Start of the literal text not yet copied into `units`.
        let mut literal_start = 0;
        while let Some(offset) = s[cursor..].find(Self::MARKER_PREFIX) {
            let idx = cursor + offset;
            match Self::decode_marker(&bytes[idx..]) {
                Some(unit) => {
                    units.extend(s[literal_start..idx].encode_utf16());
                    units.push(unit);
                    // All MARKER_LEN bytes were verified to be ASCII, so this
                    // is a char boundary.
                    cursor = idx + Self::MARKER_LEN;
                    literal_start = cursor;
                }
                // Advance by a single byte only: the prefix ends with `_` and
                // starts with `__`, so a real marker may begin on the last byte
                // of a rejected candidate (e.g. `__SURROGATE__SURROGATE_D83E__`).
                // The byte at `idx` is an ASCII `_`, so `idx + 1` is a char
                // boundary.
                None => cursor = idx + 1,
            }
        }
        units.extend(s[literal_start..].encode_utf16());
        JsString::from_code_units(units)
    }

    /// Encodes the string as ordinary UTF-8 text in which every unpaired
    /// surrogate is written as `__SURROGATE_XXXX__` (four upper-case hex
    /// digits). Valid surrogate pairs are emitted as the character they
    /// encode. UTF-8 strings are returned unchanged.
    pub fn to_marker_string(&self) -> String {
        match &self.0 {
            Repr::Utf8(text) => text.clone(),
            Repr::Wtf16(units) => Self::render_wtf16(units, |unit| {
                format!(
                    "{}{unit:04X}{}",
                    Self::MARKER_PREFIX,
                    Self::MARKER_SUFFIX
                )
            }),
        }
    }

    /// Renders the string for display: every unpaired surrogate is written as
    /// a `\uxxxx` escape (four lower-case hex digits). Valid surrogate pairs
    /// are emitted as the character they encode. UTF-8 strings are returned
    /// unchanged, so existing backslashes are not escaped.
    pub fn to_escaped_string(&self) -> String {
        match &self.0 {
            Repr::Utf8(text) => text.clone(),
            Repr::Wtf16(units) => Self::render_wtf16(units, |unit| format!("\\u{unit:04x}")),
        }
    }

    /// Parses one marker at the start of `tail` and returns its code unit.
    ///
    /// `tail` is expected to begin with [`Self::MARKER_PREFIX`]; only the hex
    /// digits, the suffix and the value range are validated here.
    fn decode_marker(tail: &[u8]) -> Option<u16> {
        let marker = tail.get(..Self::MARKER_LEN)?;
        let hex_start = Self::MARKER_PREFIX.len();
        let (head, suffix) = marker.split_at(hex_start + Self::MARKER_HEX_DIGITS);
        if suffix != Self::MARKER_SUFFIX.as_bytes() {
            return None;
        }
        let mut unit: u16 = 0;
        for &digit in &head[hex_start..] {
            // Only the digits the encoder emits are accepted: 0-9 and A-F.
            let value = match digit {
                b'0'..=b'9' => digit - b'0',
                b'A'..=b'F' => digit - b'A' + 10,
                _ => return None,
            };
            unit = (unit << 4) | u16::from(value);
        }
        // The encoder only writes markers for surrogates; any other value is
        // ordinary text that merely resembles a marker.
        if (0xD800..=0xDFFF).contains(&unit) {
            Some(unit)
        } else {
            None
        }
    }

    /// Converts code units to text, pairing surrogates where possible and
    /// delegating the spelling of each unpaired surrogate to `lone_surrogate`.
    fn render_wtf16(units: &[u16], lone_surrogate: impl Fn(u16) -> String) -> String {
        let mut out = String::with_capacity(units.len() * 2);
        for decoded in std::char::decode_utf16(units.iter().copied()) {
            match decoded {
                Ok(ch) => out.push(ch),
                Err(unpaired) => out.push_str(&lone_surrogate(unpaired.unpaired_surrogate())),
            }
        }
        out
    }
}
impl From<String> for JsString {
fn from(s: String) -> Self {
JsString(Repr::Utf8(s))
}
}
impl From<&str> for JsString {
fn from(s: &str) -> Self {
JsString(Repr::Utf8(s.to_string()))
}
}
impl PartialEq<str> for JsString {
fn eq(&self, other: &str) -> bool {
self.as_str() == Some(other)
}
}
impl PartialEq<&str> for JsString {
fn eq(&self, other: &&str) -> bool {
self.as_str() == Some(*other)
}
}
/// Displays the string in its escaped form (see `to_escaped_string`).
///
/// The text is written verbatim with `write_str`, so width, fill and precision
/// flags on the formatter have no effect.
impl fmt::Display for JsString {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
        let rendered = self.to_escaped_string();
        formatter.write_str(rendered.as_str())
    }
}
/// Serializes as a single string value holding the marker encoding produced by
/// `to_marker_string`.
impl Serialize for JsString {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        let encoded = self.to_marker_string();
        serializer.serialize_str(encoded.as_str())
    }
}
#[cfg(test)]
mod tests {
    use super::{JsString, JsStringRef};

    /// `as_ref` reports the UTF-8 view for valid input and the code-unit view
    /// otherwise; valid code units collapse to the same value as the text.
    #[test]
    fn as_ref_views_match_well_formedness() {
        let plain = JsString::from("plain");
        assert!(matches!(plain.as_ref(), JsStringRef::Utf8("plain")));

        let lone_high = JsString::from_code_units(vec![0xD83E]);
        assert!(matches!(lone_high.as_ref(), JsStringRef::Wtf16(&[0xD83E])));

        let rebuilt = JsString::from_code_units("plain".encode_utf16().collect());
        assert_eq!(rebuilt, JsString::from("plain"));
    }

    /// A single marker decodes to one code unit and encodes back unchanged.
    #[test]
    fn marker_round_trip_preserves_lone_surrogates() {
        let decoded = JsString::from_marker_string("__SURROGATE_D83E__");
        assert_eq!(decoded.code_units(), vec![0xD83E]);
        assert_eq!(decoded.to_marker_string(), "__SURROGATE_D83E__");
        assert_eq!(decoded.to_escaped_string(), "\\ud83e");
    }

    /// A high/low pair is valid UTF-16 and therefore stored as text.
    #[test]
    fn paired_halves_render_as_the_supplementary_character() {
        let clown = JsString::from_code_units(vec![0xD83E, 0xDD21]);
        assert_eq!(clown.as_str(), Some("\u{1F921}"));
    }

    /// Text without surrogates compares with `&str` and encodes as itself.
    #[test]
    fn plain_strings_stay_utf8_and_compare_with_str() {
        let directive = JsString::from("use memo");
        assert!(directive == "use memo");
        assert_eq!(directive.to_marker_string(), "use memo");
    }

    /// A marker-like run with non-hex digits is left untouched.
    #[test]
    fn malformed_marker_text_is_kept_literally() {
        let text = "__SURROGATE_XYZ__";
        let decoded = JsString::from_marker_string(text);
        assert_eq!(decoded.as_str(), Some("__SURROGATE_XYZ__"));
    }

    /// Byte-offset arithmetic must never slice inside a multi-byte character.
    #[test]
    fn multibyte_text_after_marker_prefix_does_not_panic() {
        let euro_tail = "__SURROGATE_\u{20AC}\u{20AC}";
        let decoded = JsString::from_marker_string(euro_tail);
        assert_eq!(decoded.as_str(), Some(euro_tail));

        let cut_short = "__SURROGATE_D8";
        let decoded = JsString::from_marker_string(cut_short);
        assert_eq!(decoded.as_str(), Some(cut_short));

        let decoded = JsString::from_marker_string("a\u{20AC}__SURROGATE_D83E__b\u{20AC}");
        let expected: Vec<u16> = "a\u{20AC}"
            .encode_utf16()
            .chain(std::iter::once(0xD83E))
            .chain("b\u{20AC}".encode_utf16())
            .collect();
        assert_eq!(decoded.code_units(), expected);
    }

    /// Only upper-case hex digits form a marker.
    #[test]
    fn lowercase_hex_markers_are_not_decoded() {
        let lowercase = "__SURROGATE_d83e__";
        let decoded = JsString::from_marker_string(lowercase);
        assert_eq!(decoded.as_str(), Some(lowercase));
    }
}