use std::sync::Arc;
use crate::error::{StatorError, StatorResult};
use crate::gc::trace::{Trace, Tracer};
/// Backing storage for a string whose character data lives outside the
/// engine's own buffers.
///
/// Implementors expose their contents either as one-byte data or as two-byte
/// (UTF-16 code unit) data by returning `Some` from the matching accessor.
pub trait ExternalStringResource: Send + Sync + std::fmt::Debug {
    /// Number of characters (code units) held by the resource.
    fn len(&self) -> usize;

    /// Returns `true` when [`len`](Self::len) reports zero.
    fn is_empty(&self) -> bool {
        matches!(self.len(), 0)
    }

    /// The contents as one-byte characters, or `None` if the resource does
    /// not store its data in that form.
    fn as_one_byte(&self) -> Option<&[u8]>;

    /// The contents as 16-bit code units, or `None` if the resource does not
    /// store its data in that form.
    fn as_two_byte(&self) -> Option<&[u16]>;
}
/// A flat string that stores one byte per character.
///
/// Each byte is interpreted as the code point with the same numeric value
/// (U+0000..=U+00FF), i.e. Latin-1.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SeqOneByteString {
/// Character data, one byte per character.
data: Vec<u8>,
}
impl SeqOneByteString {
#[inline]
pub fn new(data: Vec<u8>) -> Self {
Self { data }
}
pub fn from_latin1_str(s: &str) -> StatorResult<Self> {
let mut data = Vec::with_capacity(s.len());
for ch in s.chars() {
let code = ch as u32;
if code > 0xFF {
return Err(StatorError::TypeError(format!(
"character U+{code:04X} is outside the Latin-1 range"
)));
}
data.push(code as u8);
}
Ok(Self { data })
}
#[inline]
pub fn length(&self) -> usize {
self.data.len()
}
#[inline]
pub fn char_at(&self, index: usize) -> Option<u16> {
self.data.get(index).copied().map(u16::from)
}
pub fn to_utf8(&self) -> String {
self.data.iter().map(|&b| b as char).collect()
}
pub fn hash(&self) -> u32 {
fnv1a_hash_u16(self.data.iter().map(|&b| u16::from(b)))
}
}
/// A flat string that stores its contents as 16-bit code units.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SeqTwoByteString {
/// Code units in order; `from_utf8` fills this with UTF-16, while `new`
/// stores whatever units the caller supplies.
data: Vec<u16>,
}
impl SeqTwoByteString {
#[inline]
pub fn new(data: Vec<u16>) -> Self {
Self { data }
}
pub fn from_utf8(s: &str) -> Self {
Self {
data: s.encode_utf16().collect(),
}
}
#[inline]
pub fn length(&self) -> usize {
self.data.len()
}
#[inline]
pub fn char_at(&self, index: usize) -> Option<u16> {
self.data.get(index).copied()
}
pub fn to_utf8(&self) -> String {
String::from_utf16_lossy(&self.data)
}
pub fn hash(&self) -> u32 {
fnv1a_hash_u16(self.data.iter().copied())
}
}
/// A string formed by concatenating two other strings without copying their
/// contents.
#[derive(Debug, Clone)]
pub struct ConsString {
/// The leading part of the concatenation.
left: Box<JsString>,
/// The trailing part of the concatenation.
right: Box<JsString>,
/// Cached total length: the sum of both parts' lengths at construction.
length: usize,
}
impl ConsString {
pub fn new(left: JsString, right: JsString) -> Self {
let length = left.length() + right.length();
Self {
left: Box::new(left),
right: Box::new(right),
length,
}
}
#[inline]
pub fn length(&self) -> usize {
self.length
}
pub fn char_at(&self, index: usize) -> Option<u16> {
if index >= self.length {
return None;
}
let left_len = self.left.length();
if index < left_len {
self.left.char_at(index)
} else {
self.right.char_at(index - left_len)
}
}
pub fn to_utf8(&self) -> String {
let mut s = self.left.to_utf8();
s.push_str(&self.right.to_utf8());
s
}
pub fn flatten(&self) -> JsString {
JsString::new(&self.to_utf8())
}
pub fn hash(&self) -> u32 {
self.flatten().hash()
}
}
/// A view onto a contiguous range of another string.
#[derive(Debug, Clone)]
pub struct SlicedString {
/// The string this slice reads from.
parent: Box<JsString>,
/// Index in `parent` of the slice's first code unit.
offset: usize,
/// Number of code units in the slice.
length: usize,
}
impl SlicedString {
pub fn new(parent: JsString, offset: usize, length: usize) -> StatorResult<Self> {
let parent_len = parent.length();
let end = offset.checked_add(length);
if end.is_none_or(|e| e > parent_len) {
return Err(StatorError::RangeError(format!(
"slice [offset={offset}, length={length}) is out of range for a string of length {parent_len}",
)));
}
Ok(Self {
parent: Box::new(parent),
offset,
length,
})
}
#[inline]
pub fn length(&self) -> usize {
self.length
}
#[inline]
pub fn char_at(&self, index: usize) -> Option<u16> {
if index >= self.length {
return None;
}
self.parent.char_at(self.offset + index)
}
pub fn to_utf8(&self) -> String {
let units: Vec<u16> = (self.offset..self.offset + self.length)
.filter_map(|i| self.parent.char_at(i))
.collect();
String::from_utf16_lossy(&units)
}
pub fn flatten(&self) -> JsString {
JsString::new(&self.to_utf8())
}
pub fn hash(&self) -> u32 {
self.flatten().hash()
}
}
/// A string whose character data is supplied by an [`ExternalStringResource`].
#[derive(Debug, Clone)]
pub struct ExternalString {
/// Shared handle to the resource providing the character data; cloning the
/// string clones this handle, not the data.
resource: Arc<dyn ExternalStringResource>,
}
impl ExternalString {
pub fn new(resource: Arc<dyn ExternalStringResource>) -> Self {
Self { resource }
}
#[inline]
pub fn length(&self) -> usize {
self.resource.len()
}
pub fn char_at(&self, index: usize) -> Option<u16> {
if let Some(bytes) = self.resource.as_one_byte() {
bytes.get(index).copied().map(u16::from)
} else if let Some(units) = self.resource.as_two_byte() {
units.get(index).copied()
} else {
None
}
}
pub fn to_utf8(&self) -> String {
if let Some(bytes) = self.resource.as_one_byte() {
bytes.iter().map(|&b| b as char).collect()
} else if let Some(units) = self.resource.as_two_byte() {
String::from_utf16_lossy(units)
} else {
String::new()
}
}
pub fn flatten(&self) -> JsString {
JsString::new(&self.to_utf8())
}
pub fn hash(&self) -> u32 {
if let Some(bytes) = self.resource.as_one_byte() {
fnv1a_hash_u16(bytes.iter().map(|&b| u16::from(b)))
} else if let Some(units) = self.resource.as_two_byte() {
fnv1a_hash_u16(units.iter().copied())
} else {
fnv1a_hash_u16(std::iter::empty())
}
}
}
/// A string value in one of several internal representations.
///
/// All variants expose the same operations (`length`, `char_at`, `flatten`,
/// `to_utf8`, `hash`) through the inherent methods on this enum.
#[derive(Debug, Clone)]
pub enum JsString {
/// Flat storage, one byte per character.
SeqOneByte(SeqOneByteString),
/// Flat storage, one 16-bit code unit per element.
SeqTwoByte(SeqTwoByteString),
/// Concatenation of two strings.
Cons(ConsString),
/// A range of another string.
Sliced(SlicedString),
/// Character data provided by an external resource.
External(ExternalString),
}
impl JsString {
    /// Builds a flat string from UTF-8 text.
    ///
    /// The one-byte representation is used when every character fits in
    /// Latin-1; otherwise the text is stored as UTF-16 code units.
    pub fn new(s: &str) -> Self {
        SeqOneByteString::from_latin1_str(s)
            .map(JsString::SeqOneByte)
            .unwrap_or_else(|_| JsString::SeqTwoByte(SeqTwoByteString::from_utf8(s)))
    }

    /// Joins `self` and `other` into a cons string without copying either.
    pub fn concat(self, other: JsString) -> Self {
        let joined = ConsString::new(self, other);
        JsString::Cons(joined)
    }

    /// Number of 16-bit code units in the string.
    pub fn length(&self) -> usize {
        match self {
            JsString::SeqOneByte(flat) => flat.length(),
            JsString::SeqTwoByte(flat) => flat.length(),
            JsString::Cons(cons) => cons.length(),
            JsString::Sliced(slice) => slice.length(),
            JsString::External(ext) => ext.length(),
        }
    }

    /// The code unit at `index`, or `None` when `index` is out of range.
    pub fn char_at(&self, index: usize) -> Option<u16> {
        match self {
            JsString::SeqOneByte(flat) => flat.char_at(index),
            JsString::SeqTwoByte(flat) => flat.char_at(index),
            JsString::Cons(cons) => cons.char_at(index),
            JsString::Sliced(slice) => slice.char_at(index),
            JsString::External(ext) => ext.char_at(index),
        }
    }

    /// Returns a flat copy of the string; strings that are already flat are
    /// simply cloned.
    pub fn flatten(&self) -> JsString {
        match self {
            JsString::Cons(cons) => cons.flatten(),
            JsString::Sliced(slice) => slice.flatten(),
            JsString::External(ext) => ext.flatten(),
            JsString::SeqOneByte(flat) => JsString::SeqOneByte(flat.clone()),
            JsString::SeqTwoByte(flat) => JsString::SeqTwoByte(flat.clone()),
        }
    }

    /// Decodes the string to UTF-8 using the active representation's decoder.
    pub fn to_utf8(&self) -> String {
        match self {
            JsString::SeqOneByte(flat) => flat.to_utf8(),
            JsString::SeqTwoByte(flat) => flat.to_utf8(),
            JsString::Cons(cons) => cons.to_utf8(),
            JsString::Sliced(slice) => slice.to_utf8(),
            JsString::External(ext) => ext.to_utf8(),
        }
    }

    /// Hash of the string contents, delegated to the active representation.
    pub fn hash(&self) -> u32 {
        match self {
            JsString::SeqOneByte(flat) => flat.hash(),
            JsString::SeqTwoByte(flat) => flat.hash(),
            JsString::Cons(cons) => cons.hash(),
            JsString::Sliced(slice) => slice.hash(),
            JsString::External(ext) => ext.hash(),
        }
    }
}
/// 32-bit FNV-1a hash over a sequence of 16-bit code units.
///
/// Each unit contributes two bytes to the hash: its low byte first, then its
/// high byte. An empty sequence hashes to the FNV offset basis.
fn fnv1a_hash_u16(units: impl Iterator<Item = u16>) -> u32 {
    const FNV_OFFSET: u32 = 2_166_136_261;
    const FNV_PRIME: u32 = 16_777_619;

    // Little-endian byte order yields the low byte before the high byte.
    units
        .flat_map(u16::to_le_bytes)
        .fold(FNV_OFFSET, |acc, byte| {
            (acc ^ u32::from(byte)).wrapping_mul(FNV_PRIME)
        })
}
impl Trace for JsString {
    /// Visits the strings nested inside this one.
    ///
    /// Only cons and sliced strings contain other `JsString` values; flat and
    /// external strings have nothing further to visit.
    // `tracer` is only forwarded to the nested `trace` calls, which is what
    // the lint below would otherwise complain about.
    #[allow(clippy::only_used_in_recursion)]
    fn trace(&self, tracer: &mut Tracer) {
        match self {
            Self::Sliced(slice) => slice.parent.trace(tracer),
            Self::Cons(pair) => {
                pair.left.trace(tracer);
                pair.right.trace(tracer);
            }
            Self::SeqOneByte(_) | Self::SeqTwoByte(_) | Self::External(_) => {}
        }
    }
}
// Unit tests for every string representation and for the `JsString` front end.
#[cfg(test)]
mod tests {
use super::*;
// ---- SeqOneByteString -------------------------------------------------
// ASCII input is stored byte-for-byte and decodes back unchanged.
#[test]
fn test_seq_one_byte_from_str_ascii() {
let s = SeqOneByteString::from_latin1_str("hello").unwrap();
assert_eq!(s.length(), 5);
assert_eq!(s.to_utf8(), "hello");
}
// U+00E9 is two bytes in UTF-8 but a single character in the one-byte form.
#[test]
fn test_seq_one_byte_from_str_latin1() {
let s = SeqOneByteString::from_latin1_str("caf\u{00E9}").unwrap(); assert_eq!(s.length(), 4);
assert_eq!(s.to_utf8(), "café");
}
// Characters above U+00FF must be rejected with a TypeError.
#[test]
fn test_seq_one_byte_from_str_rejects_non_latin1() {
let result = SeqOneByteString::from_latin1_str("こんにちは");
assert!(matches!(result, Err(StatorError::TypeError(_))));
}
// In-range indices return the widened byte; one past the end returns None.
#[test]
fn test_seq_one_byte_char_at() {
let s = SeqOneByteString::from_latin1_str("abc").unwrap();
assert_eq!(s.char_at(0), Some(b'a' as u16));
assert_eq!(s.char_at(2), Some(b'c' as u16));
assert_eq!(s.char_at(3), None);
}
// Equal contents hash equally.
#[test]
fn test_seq_one_byte_hash_same_content() {
let a = SeqOneByteString::from_latin1_str("test").unwrap();
let b = SeqOneByteString::from_latin1_str("test").unwrap();
assert_eq!(a.hash(), b.hash());
}
// Contents differing only in case hash differently.
#[test]
fn test_seq_one_byte_hash_different_content() {
let a = SeqOneByteString::from_latin1_str("test").unwrap();
let b = SeqOneByteString::from_latin1_str("Test").unwrap();
assert_ne!(a.hash(), b.hash());
}
// ---- SeqTwoByteString -------------------------------------------------
// ASCII text round-trips through the two-byte form.
#[test]
fn test_seq_two_byte_from_str_ascii() {
let s = SeqTwoByteString::from_utf8("hello");
assert_eq!(s.length(), 5);
assert_eq!(s.to_utf8(), "hello");
}
// A character outside the BMP occupies two code units (a surrogate pair).
#[test]
fn test_seq_two_byte_from_str_emoji() {
let s = SeqTwoByteString::from_utf8("😀");
assert_eq!(s.length(), 2);
assert_eq!(s.to_utf8(), "😀");
}
// BMP characters occupy one code unit each.
#[test]
fn test_seq_two_byte_from_str_japanese() {
let s = SeqTwoByteString::from_utf8("こんにちは");
assert_eq!(s.length(), 5);
assert_eq!(s.to_utf8(), "こんにちは");
}
// In-range indices return the stored unit; one past the end returns None.
#[test]
fn test_seq_two_byte_char_at() {
let s = SeqTwoByteString::new(vec![0x0048, 0x0069]); assert_eq!(s.char_at(0), Some(0x0048));
assert_eq!(s.char_at(1), Some(0x0069));
assert_eq!(s.char_at(2), None);
}
// Equal contents hash equally.
#[test]
fn test_seq_two_byte_hash_same_content() {
let a = SeqTwoByteString::from_utf8("hello");
let b = SeqTwoByteString::from_utf8("hello");
assert_eq!(a.hash(), b.hash());
}
// The hash depends on the characters, not on the storage width.
#[test]
fn test_hash_consistent_one_byte_vs_two_byte() {
let one = JsString::SeqOneByte(SeqOneByteString::from_latin1_str("abc").unwrap());
let two = JsString::SeqTwoByte(SeqTwoByteString::from_utf8("abc"));
assert_eq!(one.hash(), two.hash());
}
// ---- ConsString -------------------------------------------------------
// The length of a concatenation is the sum of its parts.
#[test]
fn test_cons_concatenation_length() {
let a = JsString::new("hello");
let b = JsString::new(" world");
let c = a.concat(b);
assert_eq!(c.length(), 11);
}
// Decoding yields the left text followed by the right text.
#[test]
fn test_cons_to_utf8() {
let a = JsString::new("foo");
let b = JsString::new("bar");
let c = a.concat(b);
assert_eq!(c.to_utf8(), "foobar");
}
// Indexing crosses the seam between the two parts transparently.
#[test]
fn test_cons_char_at() {
let a = JsString::new("ab");
let b = JsString::new("cd");
let c = a.concat(b);
assert_eq!(c.char_at(0), Some(b'a' as u16));
assert_eq!(c.char_at(1), Some(b'b' as u16));
assert_eq!(c.char_at(2), Some(b'c' as u16));
assert_eq!(c.char_at(3), Some(b'd' as u16));
assert_eq!(c.char_at(4), None);
}
// Flattening produces one of the two flat variants with the same text.
#[test]
fn test_cons_flatten_returns_flat_string() {
let a = JsString::new("hello");
let b = JsString::new(" world");
let cons = a.concat(b);
let flat = cons.flatten();
assert!(matches!(
flat,
JsString::SeqOneByte(_) | JsString::SeqTwoByte(_)
));
assert_eq!(flat.to_utf8(), "hello world");
}
// A concatenation hashes like the equivalent flat string.
#[test]
fn test_cons_hash_matches_flat() {
let a = JsString::new("foo");
let b = JsString::new("bar");
let cons = a.clone().concat(b.clone());
let flat = JsString::new("foobar");
assert_eq!(cons.hash(), flat.hash());
}
// An empty left part contributes nothing.
#[test]
fn test_cons_empty_strings() {
let a = JsString::new("");
let b = JsString::new("hello");
let c = a.concat(b);
assert_eq!(c.length(), 5);
assert_eq!(c.to_utf8(), "hello");
}
// ---- SlicedString -----------------------------------------------------
// A slice exposes only the requested range of its parent.
#[test]
fn test_sliced_basic_slice() {
let parent = JsString::new("hello world");
let slice = SlicedString::new(parent, 6, 5).unwrap();
assert_eq!(slice.length(), 5);
assert_eq!(slice.to_utf8(), "world");
}
// Indices are relative to the slice start and bounded by the slice length.
#[test]
fn test_sliced_char_at() {
let parent = JsString::new("abcdef");
let slice = SlicedString::new(parent, 2, 3).unwrap(); assert_eq!(slice.char_at(0), Some(b'c' as u16));
assert_eq!(slice.char_at(1), Some(b'd' as u16));
assert_eq!(slice.char_at(2), Some(b'e' as u16));
assert_eq!(slice.char_at(3), None);
}
// A range running past the end of the parent is a RangeError.
#[test]
fn test_sliced_out_of_bounds_returns_error() {
let parent = JsString::new("hi");
let result = SlicedString::new(parent, 1, 5);
assert!(matches!(result, Err(StatorError::RangeError(_))));
}
// offset + length overflowing usize is reported as a RangeError, not a panic.
#[test]
fn test_sliced_offset_overflow_returns_error() {
let parent = JsString::new("hi");
let result = SlicedString::new(parent, usize::MAX, 1);
assert!(matches!(result, Err(StatorError::RangeError(_))));
}
// Flattening produces one of the two flat variants with the sliced text.
#[test]
fn test_sliced_flatten_returns_flat_string() {
let parent = JsString::new("hello world");
let slice = SlicedString::new(parent, 0, 5).unwrap();
let flat = slice.flatten();
assert!(matches!(
flat,
JsString::SeqOneByte(_) | JsString::SeqTwoByte(_)
));
assert_eq!(flat.to_utf8(), "hello");
}
// A slice hashes like the equivalent flat string.
#[test]
fn test_sliced_hash_matches_flat() {
let parent = JsString::new("hello world");
let slice = JsString::Sliced(SlicedString::new(parent, 6, 5).unwrap());
let flat = JsString::new("world");
assert_eq!(slice.hash(), flat.hash());
}
// A zero-length slice is valid and empty.
#[test]
fn test_sliced_empty_slice() {
let parent = JsString::new("hello");
let slice = SlicedString::new(parent, 2, 0).unwrap();
assert_eq!(slice.length(), 0);
assert_eq!(slice.to_utf8(), "");
}
// ---- ExternalString ---------------------------------------------------
// Test resource that exposes only one-byte data.
#[derive(Debug)]
struct TestOneByteResource(Vec<u8>);
impl ExternalStringResource for TestOneByteResource {
fn len(&self) -> usize {
self.0.len()
}
fn as_one_byte(&self) -> Option<&[u8]> {
Some(&self.0)
}
fn as_two_byte(&self) -> Option<&[u16]> {
None
}
}
// Test resource that exposes only two-byte data.
#[derive(Debug)]
struct TestTwoByteResource(Vec<u16>);
impl ExternalStringResource for TestTwoByteResource {
fn len(&self) -> usize {
self.0.len()
}
fn as_one_byte(&self) -> Option<&[u8]> {
None
}
fn as_two_byte(&self) -> Option<&[u16]> {
Some(&self.0)
}
}
// Length and decoding for a one-byte resource.
#[test]
fn test_external_one_byte_length_and_to_utf8() {
let res: Arc<dyn ExternalStringResource> = Arc::new(TestOneByteResource(b"hello".to_vec()));
let s = ExternalString::new(res);
assert_eq!(s.length(), 5);
assert_eq!(s.to_utf8(), "hello");
}
// Length and decoding for a two-byte resource.
#[test]
fn test_external_two_byte_length_and_to_utf8() {
let units: Vec<u16> = "こんにちは".encode_utf16().collect();
let res: Arc<dyn ExternalStringResource> = Arc::new(TestTwoByteResource(units));
let s = ExternalString::new(res);
assert_eq!(s.length(), 5);
assert_eq!(s.to_utf8(), "こんにちは");
}
// Indexing reads through to the resource and is bounds-checked.
#[test]
fn test_external_char_at() {
let res: Arc<dyn ExternalStringResource> = Arc::new(TestOneByteResource(b"abc".to_vec()));
let s = ExternalString::new(res);
assert_eq!(s.char_at(0), Some(b'a' as u16));
assert_eq!(s.char_at(2), Some(b'c' as u16));
assert_eq!(s.char_at(3), None);
}
// Flattening produces one of the two flat variants with the same text.
#[test]
fn test_external_flatten_returns_flat_string() {
let res: Arc<dyn ExternalStringResource> = Arc::new(TestOneByteResource(b"world".to_vec()));
let s = ExternalString::new(res);
let flat = s.flatten();
assert!(matches!(
flat,
JsString::SeqOneByte(_) | JsString::SeqTwoByte(_)
));
assert_eq!(flat.to_utf8(), "world");
}
// An external string hashes like the equivalent flat string.
#[test]
fn test_external_hash_matches_seq() {
let res: Arc<dyn ExternalStringResource> = Arc::new(TestOneByteResource(b"hello".to_vec()));
let ext = JsString::External(ExternalString::new(res));
let seq = JsString::new("hello");
assert_eq!(ext.hash(), seq.hash());
}
// ---- UTF-8 round trips through JsString -------------------------------
#[test]
fn test_utf8_roundtrip_ascii() {
let original = "The quick brown fox";
assert_eq!(JsString::new(original).to_utf8(), original);
}
#[test]
fn test_utf8_roundtrip_latin1() {
let original = "résumé";
assert_eq!(JsString::new(original).to_utf8(), original);
}
#[test]
fn test_utf8_roundtrip_japanese() {
let original = "日本語";
assert_eq!(JsString::new(original).to_utf8(), original);
}
#[test]
fn test_utf8_roundtrip_emoji() {
let original = "Hello 🌍!";
assert_eq!(JsString::new(original).to_utf8(), original);
}
// Mixed one-byte and two-byte parts decode as a single text.
#[test]
fn test_utf8_roundtrip_cons() {
let a = JsString::new("Hello");
let b = JsString::new(", 世界!");
let c = a.concat(b);
assert_eq!(c.to_utf8(), "Hello, 世界!");
}
#[test]
fn test_utf8_roundtrip_sliced() {
let parent = JsString::new("Hello, world!");
let sliced = JsString::Sliced(SlicedString::new(parent, 7, 5).unwrap());
assert_eq!(sliced.to_utf8(), "world");
}
// ---- Representation chosen by JsString::new ---------------------------
#[test]
fn test_from_str_selects_one_byte_for_ascii() {
let s = JsString::new("ascii");
assert!(matches!(s, JsString::SeqOneByte(_)));
}
#[test]
fn test_from_str_selects_one_byte_for_latin1() {
let s = JsString::new("caf\u{00E9}");
assert!(matches!(s, JsString::SeqOneByte(_)));
}
#[test]
fn test_from_str_selects_two_byte_for_non_latin1() {
let s = JsString::new("日本語");
assert!(matches!(s, JsString::SeqTwoByte(_)));
}
// ---- Empty-string edge cases ------------------------------------------
#[test]
fn test_length_empty_string() {
assert_eq!(JsString::new("").length(), 0);
}
#[test]
fn test_char_at_empty_string() {
assert_eq!(JsString::new("").char_at(0), None);
}
}