use std::borrow::Borrow;
use std::cmp::Ordering;
use std::collections::HashMap;
use std::fmt::{self, Debug, Display, Formatter};
use std::hash::{Hash, Hasher};
use std::num::NonZeroU64;
use std::ops::Deref;
use std::sync::{LazyLock, RwLock};
const MARKER: u64 = 1 << 63;
static INTERNER: LazyLock<RwLock<Interner>> =
LazyLock::new(|| RwLock::new(Interner { seen: HashMap::new(), strings: Vec::new() }));
struct Interner {
seen: HashMap<&'static str, PicoStr>,
strings: Vec<&'static str>,
}
#[derive(Copy, Clone, Eq, PartialEq, Hash)]
pub struct PicoStr(NonZeroU64);
impl PicoStr {
pub fn intern(string: &str) -> PicoStr {
if let Ok(value) = PicoStr::try_constant(string) {
return value;
}
let mut interner = INTERNER.write().unwrap();
if let Some(&id) = interner.seen.get(string) {
return id;
}
let num = exceptions::LIST.len() + interner.strings.len() + 1;
let id = Self(NonZeroU64::new(num as u64).unwrap());
let string = Box::leak(string.to_string().into_boxed_str());
interner.seen.insert(string, id);
interner.strings.push(string);
id
}
#[track_caller]
pub const fn constant(string: &'static str) -> PicoStr {
match PicoStr::try_constant(string) {
Ok(value) => value,
Err(err) => panic!("{}", err.message()),
}
}
pub const fn try_constant(string: &str) -> Result<PicoStr, bitcode::EncodingError> {
let value = match bitcode::encode(string) {
Ok(v) => v | MARKER,
Err(e) => {
if let Some(i) = exceptions::get(string) {
i as u64 + 1
} else {
return Err(e);
}
}
};
match NonZeroU64::new(value) {
Some(value) => Ok(Self(value)),
None => unreachable!(),
}
}
pub fn resolve(self) -> ResolvedPicoStr {
let value = self.0.get();
if value & MARKER != 0 {
return bitcode::decode(value & !MARKER);
}
let index = (value - 1) as usize;
let string = if let Some(runtime) = index.checked_sub(exceptions::LIST.len()) {
INTERNER.read().unwrap().strings[runtime]
} else {
exceptions::LIST[index]
};
ResolvedPicoStr(Repr::Static(string))
}
}
impl Debug for PicoStr {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
Debug::fmt(self.resolve().as_str(), f)
}
}
mod bitcode {
use super::{Repr, ResolvedPicoStr};
const DECODE: &[u8; 32] = b"\0abcdefghijklmnopqrstuvwxyz-1234";
const ENCODE: &[u8; 256] = &{
let mut map = [0; 256];
let mut i = 0;
while i < DECODE.len() {
map[DECODE[i] as usize] = i as u8;
i += 1;
}
map
};
pub const fn encode(string: &str) -> Result<u64, EncodingError> {
let bytes = string.as_bytes();
if bytes.len() > 12 {
return Err(EncodingError::TooLong);
}
let mut num: u64 = 0;
let mut i = bytes.len();
while i > 0 {
i -= 1;
let b = bytes[i];
let v = ENCODE[b as usize];
if v == 0 {
return Err(EncodingError::BadChar);
}
num <<= 5;
num |= v as u64;
}
Ok(num)
}
pub const fn decode(mut value: u64) -> ResolvedPicoStr {
let mut buf = [0; 12];
let mut len = 0;
while value != 0 {
let v = value & 0b11111;
buf[len as usize] = DECODE[v as usize];
len += 1;
value >>= 5;
}
ResolvedPicoStr(Repr::Inline(buf, len))
}
pub enum EncodingError {
TooLong,
BadChar,
}
impl EncodingError {
pub const fn message(&self) -> &'static str {
match self {
Self::TooLong => {
"the maximum auto-internible string length is 12. \
you can add an exception to typst-utils/src/pico.rs \
to intern longer strings."
}
Self::BadChar => {
"can only auto-intern the chars 'a'-'z', '1'-'4', and '-'. \
you can add an exception to typst-utils/src/pico.rs \
to intern other strings."
}
}
}
}
}
mod exceptions {
use std::cmp::Ordering;
pub const LIST: &[&str] = &[
"cjk-latin-spacing",
"discretionary-ligatures",
"h5",
"h6",
"historical-ligatures",
"number-clearance",
"number-margin",
"numbering-scope",
"page-numbering",
"par-line-marker",
"transparentize",
];
pub const fn get(string: &str) -> Option<usize> {
let mut lo = 0;
let mut hi = LIST.len();
while lo < hi {
let mid = (lo + hi) / 2;
match strcmp(string, LIST[mid]) {
Ordering::Less => hi = mid,
Ordering::Greater => lo = mid + 1,
Ordering::Equal => return Some(mid),
}
}
None
}
const fn strcmp(a: &str, b: &str) -> Ordering {
let a = a.as_bytes();
let b = b.as_bytes();
let l = min(a.len(), b.len());
let mut i = 0;
while i < l {
if a[i] == b[i] {
i += 1;
} else if a[i] < b[i] {
return Ordering::Less;
} else {
return Ordering::Greater;
}
}
if i < b.len() {
Ordering::Less
} else if i < a.len() {
Ordering::Greater
} else {
Ordering::Equal
}
}
const fn min(a: usize, b: usize) -> usize {
if a < b {
a
} else {
b
}
}
}
pub struct ResolvedPicoStr(Repr);
enum Repr {
Inline([u8; 12], u8),
Static(&'static str),
}
impl ResolvedPicoStr {
pub fn as_str(&self) -> &str {
match &self.0 {
Repr::Inline(buf, len) => unsafe {
std::str::from_utf8_unchecked(&buf[..*len as usize])
},
Repr::Static(s) => s,
}
}
}
impl Debug for ResolvedPicoStr {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
Debug::fmt(self.as_str(), f)
}
}
impl Display for ResolvedPicoStr {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
Display::fmt(self.as_str(), f)
}
}
impl Deref for ResolvedPicoStr {
type Target = str;
fn deref(&self) -> &Self::Target {
self.as_str()
}
}
impl AsRef<str> for ResolvedPicoStr {
fn as_ref(&self) -> &str {
self.as_str()
}
}
impl Borrow<str> for ResolvedPicoStr {
fn borrow(&self) -> &str {
self.as_str()
}
}
impl Eq for ResolvedPicoStr {}
impl PartialEq for ResolvedPicoStr {
fn eq(&self, other: &Self) -> bool {
self.as_str().eq(other.as_str())
}
}
impl Ord for ResolvedPicoStr {
fn cmp(&self, other: &Self) -> Ordering {
self.as_str().cmp(other.as_str())
}
}
impl PartialOrd for ResolvedPicoStr {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Hash for ResolvedPicoStr {
fn hash<H: Hasher>(&self, state: &mut H) {
self.as_str().hash(state);
}
}
#[cfg(test)]
mod tests {
use super::*;
#[track_caller]
fn roundtrip(s: &str) {
assert_eq!(PicoStr::intern(s).resolve().as_str(), s);
}
#[test]
fn test_pico_str() {
const H1: PicoStr = PicoStr::constant("h1");
assert_eq!(H1, PicoStr::intern("h1"));
assert_eq!(H1.resolve().as_str(), "h1");
const DISC: PicoStr = PicoStr::constant("discretionary-ligatures");
assert_eq!(DISC, PicoStr::intern("discretionary-ligatures"));
assert_eq!(DISC.resolve().as_str(), "discretionary-ligatures");
roundtrip("");
roundtrip("hi");
roundtrip("∆@<hi-10_");
roundtrip("you");
roundtrip("discretionary-ligatures");
}
#[test]
fn test_exceptions_not_bitcode_encodable() {
for s in exceptions::LIST {
assert!(
bitcode::encode(s).is_err(),
"{s:?} can be encoded with bitcode and should not be an exception"
);
}
}
#[test]
fn test_exceptions_sorted() {
for group in exceptions::LIST.windows(2) {
assert!(group[0] < group[1], "{group:?} are out of order");
}
}
#[test]
fn test_exception_find() {
for (i, s) in exceptions::LIST.iter().enumerate() {
assert_eq!(exceptions::get(s), Some(i), "wrong index for {s:?}");
}
assert_eq!(exceptions::get("a"), None);
assert_eq!(exceptions::get("another-"), None);
assert_eq!(exceptions::get("z"), None);
}
}