use std::collections::HashMap;
use super::functions::{
compat_fold, is_combining, rope_collect, rope_depth, rope_node_len, FNV_OFFSET_BASIS, FNV_PRIME,
};
#[allow(dead_code)]
struct TrieNode {
children: HashMap<char, Box<TrieNode>>,
terminal: Option<InternedString>,
}
#[allow(dead_code)]
impl TrieNode {
fn new() -> Self {
Self {
children: HashMap::new(),
terminal: None,
}
}
}
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub struct Fnv1aHasher {
state: u64,
}
#[allow(dead_code)]
impl Fnv1aHasher {
pub fn new() -> Self {
Self {
state: FNV_OFFSET_BASIS,
}
}
pub fn write_byte(&mut self, byte: u8) {
self.state ^= byte as u64;
self.state = self.state.wrapping_mul(FNV_PRIME);
}
pub fn write_str(&mut self, s: &str) {
for byte in s.as_bytes() {
self.write_byte(*byte);
}
}
pub fn finish(&self) -> u64 {
self.state
}
pub fn hash_str(s: &str) -> u64 {
let mut h = Self::new();
h.write_str(s);
h.finish()
}
pub fn hash_str_32(s: &str) -> u32 {
let h = Self::hash_str(s);
((h >> 32) ^ h) as u32
}
}
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub struct PoolStatistics {
pub unique_count: usize,
pub unique_bytes: usize,
pub total_intern_requests: usize,
pub total_requested_bytes: usize,
}
impl PoolStatistics {
pub fn bytes_saved(&self) -> usize {
self.total_requested_bytes.saturating_sub(self.unique_bytes)
}
pub fn dedup_ratio(&self) -> f64 {
if self.total_requested_bytes == 0 {
0.0
} else {
self.bytes_saved() as f64 / self.total_requested_bytes as f64
}
}
pub fn avg_string_len(&self) -> f64 {
if self.unique_count == 0 {
0.0
} else {
self.unique_bytes as f64 / self.unique_count as f64
}
}
}
pub struct StringPool {
map: HashMap<String, u32>,
pub(super) strings: Vec<String>,
stats: PoolStatistics,
}
impl StringPool {
pub fn new() -> Self {
Self {
map: HashMap::new(),
strings: Vec::new(),
stats: PoolStatistics::default(),
}
}
pub fn with_capacity(cap: usize) -> Self {
Self {
map: HashMap::with_capacity(cap),
strings: Vec::with_capacity(cap),
stats: PoolStatistics::default(),
}
}
pub fn intern(&mut self, s: &str) -> InternedString {
self.stats.total_intern_requests += 1;
self.stats.total_requested_bytes += s.len();
if let Some(&idx) = self.map.get(s) {
return InternedString { index: idx };
}
let idx = self.strings.len() as u32;
self.strings.push(s.to_string());
self.map.insert(s.to_string(), idx);
self.stats.unique_count += 1;
self.stats.unique_bytes += s.len();
InternedString { index: idx }
}
pub fn intern_bulk(&mut self, strs: &[&str]) -> Vec<InternedString> {
strs.iter().map(|s| self.intern(s)).collect()
}
pub fn intern_iter<'a, I>(&mut self, iter: I) -> Vec<InternedString>
where
I: IntoIterator<Item = &'a str>,
{
iter.into_iter().map(|s| self.intern(s)).collect()
}
pub fn resolve(&self, id: InternedString) -> Option<&str> {
self.strings.get(id.index as usize).map(|s| s.as_str())
}
pub fn lookup(&self, s: &str) -> Option<InternedString> {
self.map.get(s).map(|&idx| InternedString { index: idx })
}
pub fn contains(&self, s: &str) -> bool {
self.map.contains_key(s)
}
pub fn len(&self) -> usize {
self.strings.len()
}
pub fn is_empty(&self) -> bool {
self.strings.is_empty()
}
pub fn statistics(&self) -> &PoolStatistics {
&self.stats
}
pub fn snapshot(&self) -> PoolSnapshot {
PoolSnapshot {
strings: self.strings.clone(),
}
}
pub fn iter(&self) -> impl Iterator<Item = (InternedString, &str)> {
self.strings
.iter()
.enumerate()
.map(|(i, s)| (InternedString { index: i as u32 }, s.as_str()))
}
pub fn clear(&mut self) {
self.map.clear();
self.strings.clear();
self.stats.unique_count = 0;
self.stats.unique_bytes = 0;
}
pub fn merge(&mut self, other: &StringPool) -> Vec<InternedString> {
other.strings.iter().map(|s| self.intern(s)).collect()
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct GenerationalString {
index: u32,
generation: u16,
}
#[allow(dead_code)]
impl GenerationalString {
pub fn index(self) -> u32 {
self.index
}
pub fn generation(self) -> u16 {
self.generation
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum StringCategory {
AlphaNum,
Alpha,
Numeric,
Whitespace,
Empty,
Identifier,
Mixed,
}
#[allow(dead_code)]
pub struct TriePool {
root: TrieNode,
pool: StringPool,
}
#[allow(dead_code)]
impl TriePool {
pub fn new() -> Self {
Self {
root: TrieNode::new(),
pool: StringPool::new(),
}
}
pub fn insert(&mut self, s: &str) -> InternedString {
let id = self.pool.intern(s);
let mut node = &mut self.root;
for ch in s.chars() {
node = node
.children
.entry(ch)
.or_insert_with(|| Box::new(TrieNode::new()));
}
node.terminal = Some(id);
id
}
fn collect_all(node: &TrieNode, result: &mut Vec<InternedString>) {
if let Some(id) = node.terminal {
result.push(id);
}
for child in node.children.values() {
Self::collect_all(child, result);
}
}
pub fn find_prefix(&self, prefix: &str) -> Vec<InternedString> {
let mut node = &self.root;
for ch in prefix.chars() {
match node.children.get(&ch) {
Some(child) => node = child,
None => return Vec::new(),
}
}
let mut result = Vec::new();
Self::collect_all(node, &mut result);
result
}
pub fn get(&self, s: &str) -> Option<InternedString> {
self.pool.lookup(s)
}
pub fn resolve(&self, id: InternedString) -> Option<&str> {
self.pool.resolve(id)
}
pub fn len(&self) -> usize {
self.pool.len()
}
pub fn is_empty(&self) -> bool {
self.pool.is_empty()
}
}
#[allow(dead_code)]
pub struct PoolGrowthEstimator {
history: Vec<usize>,
window: usize,
}
#[allow(dead_code)]
impl PoolGrowthEstimator {
pub fn new(window: usize) -> Self {
Self {
history: Vec::new(),
window: window.max(1),
}
}
pub fn record(&mut self, size: usize) {
self.history.push(size);
if self.history.len() > self.window * 2 {
let drain_to = self.history.len() - self.window;
self.history.drain(..drain_to);
}
}
pub fn avg_growth(&self) -> f64 {
if self.history.len() < 2 {
return 0.0;
}
let _n = self.history.len();
let deltas: Vec<f64> = self
.history
.windows(2)
.map(|w| w[1] as f64 - w[0] as f64)
.collect();
deltas.iter().sum::<f64>() / deltas.len() as f64
}
pub fn estimate_after(&self, n: usize) -> f64 {
let last = self.history.last().copied().unwrap_or(0) as f64;
last + self.avg_growth() * n as f64
}
pub fn observation_count(&self) -> usize {
self.history.len()
}
}
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub struct Rope {
pub(super) root: Option<RopeNode>,
}
#[allow(dead_code)]
impl Rope {
pub fn new() -> Self {
Self { root: None }
}
pub fn from_str(s: &str) -> Self {
if s.is_empty() {
Self { root: None }
} else {
Self {
root: Some(RopeNode::Leaf(s.to_string())),
}
}
}
pub fn len(&self) -> usize {
match &self.root {
None => 0,
Some(node) => rope_node_len(node),
}
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn concat(self, other: Rope) -> Rope {
match (self.root, other.root) {
(None, r) => Rope { root: r },
(l, None) => Rope { root: l },
(Some(l), Some(r)) => {
let total = rope_node_len(&l) + rope_node_len(&r);
Rope {
root: Some(RopeNode::Concat(Box::new(l), Box::new(r), total)),
}
}
}
}
pub fn append_str(self, s: &str) -> Rope {
self.concat(Rope::from_str(s))
}
pub fn collect_string(&self) -> String {
use std::fmt::Write;
let mut buf = String::with_capacity(self.len());
if let Some(node) = &self.root {
rope_collect(node, &mut buf);
}
buf
}
pub fn depth(&self) -> usize {
match &self.root {
None => 0,
Some(node) => rope_depth(node),
}
}
}
#[allow(dead_code)]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PoolDiff {
pub added: Vec<String>,
pub removed: Vec<String>,
pub common: Vec<String>,
}
#[allow(dead_code)]
impl PoolDiff {
pub fn compute(old: &PoolSnapshot, new: &PoolSnapshot) -> Self {
use std::collections::HashSet;
let old_set: HashSet<&str> = old.strings.iter().map(|s| s.as_str()).collect();
let new_set: HashSet<&str> = new.strings.iter().map(|s| s.as_str()).collect();
let added = new_set
.difference(&old_set)
.map(|s| s.to_string())
.collect();
let removed = old_set
.difference(&new_set)
.map(|s| s.to_string())
.collect();
let common = old_set
.intersection(&new_set)
.map(|s| s.to_string())
.collect();
Self {
added,
removed,
common,
}
}
pub fn added_count(&self) -> usize {
self.added.len()
}
pub fn removed_count(&self) -> usize {
self.removed.len()
}
pub fn is_empty(&self) -> bool {
self.added.is_empty() && self.removed.is_empty()
}
}
#[allow(dead_code)]
pub struct CaseFoldPool {
inner: StringPool,
}
#[allow(dead_code)]
impl CaseFoldPool {
pub fn new() -> Self {
Self {
inner: StringPool::new(),
}
}
pub fn intern(&mut self, s: &str) -> InternedString {
let folded = s.to_lowercase();
self.inner.intern(&folded)
}
pub fn resolve(&self, id: InternedString) -> Option<&str> {
self.inner.resolve(id)
}
pub fn contains(&self, s: &str) -> bool {
self.inner.contains(&s.to_lowercase())
}
pub fn len(&self) -> usize {
self.inner.len()
}
pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
pub fn inner(&self) -> &StringPool {
&self.inner
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum NormForm {
Nfc,
Nfd,
Nfkc,
Nfkd,
}
#[derive(Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct InternedString {
pub(super) index: u32,
}
impl InternedString {
pub fn from_raw(index: u32) -> Self {
Self { index }
}
pub fn raw_index(self) -> u32 {
self.index
}
}
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct PoolSnapshot {
pub strings: Vec<String>,
}
impl PoolSnapshot {
pub fn len(&self) -> usize {
self.strings.len()
}
pub fn is_empty(&self) -> bool {
self.strings.is_empty()
}
pub fn get(&self, id: InternedString) -> Option<&str> {
self.strings.get(id.index as usize).map(|s| s.as_str())
}
pub fn restore(&self) -> StringPool {
let mut pool = StringPool::new();
for s in &self.strings {
pool.intern(s);
}
pool
}
pub fn total_bytes(&self) -> usize {
self.strings.iter().map(|s| s.len()).sum()
}
pub fn encode(&self) -> Vec<u8> {
let mut buf = Vec::new();
let count = self.strings.len() as u32;
buf.extend_from_slice(&count.to_le_bytes());
for s in &self.strings {
let len = s.len() as u32;
buf.extend_from_slice(&len.to_le_bytes());
buf.extend_from_slice(s.as_bytes());
}
buf
}
pub fn decode(data: &[u8]) -> Option<Self> {
if data.len() < 4 {
return None;
}
let count = u32::from_le_bytes([data[0], data[1], data[2], data[3]]) as usize;
let mut pos = 4;
let mut strings = Vec::with_capacity(count);
for _ in 0..count {
if pos + 4 > data.len() {
return None;
}
let len = u32::from_le_bytes([data[pos], data[pos + 1], data[pos + 2], data[pos + 3]])
as usize;
pos += 4;
if pos + len > data.len() {
return None;
}
let s = std::str::from_utf8(&data[pos..pos + len]).ok()?;
strings.push(s.to_string());
pos += len;
}
Some(Self { strings })
}
}
#[allow(dead_code)]
pub struct NormalizedPool {
form: NormForm,
inner: StringPool,
}
#[allow(dead_code)]
impl NormalizedPool {
pub fn new(form: NormForm) -> Self {
Self {
form,
inner: StringPool::new(),
}
}
pub fn intern(&mut self, s: &str) -> InternedString {
let normalized = self.normalize(s);
self.inner.intern(&normalized)
}
pub fn normalize(&self, s: &str) -> String {
match self.form {
NormForm::Nfc | NormForm::Nfd => s.chars().filter(|c| !is_combining(*c)).collect(),
NormForm::Nfkc | NormForm::Nfkd => s
.chars()
.filter(|c| !is_combining(*c))
.map(compat_fold)
.collect(),
}
}
pub fn resolve(&self, id: InternedString) -> Option<&str> {
self.inner.resolve(id)
}
pub fn len(&self) -> usize {
self.inner.len()
}
pub fn is_empty(&self) -> bool {
self.inner.is_empty()
}
}
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(super) enum RopeNode {
Leaf(String),
Concat(Box<RopeNode>, Box<RopeNode>, usize),
}
#[allow(dead_code)]
pub struct PrefixPool {
pool: StringPool,
}
#[allow(dead_code)]
impl PrefixPool {
pub fn new() -> Self {
Self {
pool: StringPool::new(),
}
}
pub fn intern(&mut self, s: &str) -> InternedString {
self.pool.intern(s)
}
pub fn longest_prefix(&self, s: &str) -> Option<InternedString> {
let mut best: Option<InternedString> = None;
let mut best_len = 0usize;
for (id, interned) in self.pool.iter() {
if s.starts_with(interned) && interned.len() >= best_len {
best_len = interned.len();
best = Some(id);
}
}
best
}
pub fn all_prefixes(&self, s: &str) -> Vec<InternedString> {
let mut result: Vec<(usize, InternedString)> = self
.pool
.iter()
.filter(|(_, interned)| s.starts_with(*interned))
.map(|(id, interned)| (interned.len(), id))
.collect();
result.sort_by_key(|b| std::cmp::Reverse(b.0));
result.into_iter().map(|(_, id)| id).collect()
}
pub fn resolve(&self, id: InternedString) -> Option<&str> {
self.pool.resolve(id)
}
pub fn len(&self) -> usize {
self.pool.len()
}
pub fn is_empty(&self) -> bool {
self.pool.is_empty()
}
}
#[allow(dead_code)]
pub struct StringIndex {
sorted: Vec<(String, InternedString)>,
}
#[allow(dead_code)]
impl StringIndex {
pub fn build(pool: &StringPool) -> Self {
let mut sorted: Vec<(String, InternedString)> =
pool.iter().map(|(id, s)| (s.to_string(), id)).collect();
sorted.sort_by(|a, b| a.0.cmp(&b.0));
Self { sorted }
}
pub fn find_prefix(&self, prefix: &str) -> Vec<InternedString> {
let start = self.sorted.partition_point(|(s, _)| s.as_str() < prefix);
self.sorted[start..]
.iter()
.take_while(|(s, _)| s.starts_with(prefix))
.map(|(_, id)| *id)
.collect()
}
pub fn find_suffix(&self, suffix: &str) -> Vec<InternedString> {
self.sorted
.iter()
.filter(|(s, _)| s.ends_with(suffix))
.map(|(_, id)| *id)
.collect()
}
pub fn find_contains(&self, sub: &str) -> Vec<InternedString> {
self.sorted
.iter()
.filter(|(s, _)| s.contains(sub))
.map(|(_, id)| *id)
.collect()
}
pub fn len(&self) -> usize {
self.sorted.len()
}
pub fn is_empty(&self) -> bool {
self.sorted.is_empty()
}
}
#[allow(dead_code)]
pub struct PoolWriter;
#[allow(dead_code)]
impl PoolWriter {
pub fn write_to_string(pool: &StringPool) -> String {
let mut out = String::new();
for (id, s) in pool.iter() {
out.push_str(&format!("{}: {:?}\n", id.raw_index(), s));
}
out
}
pub fn write_stats_to_string(pool: &StringPool) -> String {
format!("{}", pool.statistics())
}
}
#[allow(dead_code)]
pub struct PoolPartition {
pub matching: Vec<InternedString>,
pub non_matching: Vec<InternedString>,
}
#[allow(dead_code)]
impl PoolPartition {
pub fn by<F>(pool: &StringPool, mut pred: F) -> Self
where
F: FnMut(&str) -> bool,
{
let mut matching = Vec::new();
let mut non_matching = Vec::new();
for (id, s) in pool.iter() {
if pred(s) {
matching.push(id);
} else {
non_matching.push(id);
}
}
Self {
matching,
non_matching,
}
}
pub fn matching_count(&self) -> usize {
self.matching.len()
}
pub fn non_matching_count(&self) -> usize {
self.non_matching.len()
}
}
#[allow(dead_code)]
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub struct InternedSlice {
pub base: InternedString,
pub start: u32,
pub end: u32,
}
#[allow(dead_code)]
impl InternedSlice {
pub fn new(base: InternedString, start: u32, end: u32) -> Self {
Self { base, start, end }
}
pub fn len(&self) -> usize {
(self.end - self.start) as usize
}
pub fn is_empty(&self) -> bool {
self.start >= self.end
}
pub fn resolve<'a>(&self, pool: &'a StringPool) -> Option<&'a str> {
let base = pool.resolve(self.base)?;
let start = self.start as usize;
let end = self.end as usize;
if end > base.len() {
return None;
}
base.get(start..end)
}
}
#[allow(dead_code)]
pub struct GenerationalPool {
pool: StringPool,
generation: u16,
ref_counts: Vec<u32>,
}
#[allow(dead_code)]
impl GenerationalPool {
pub fn new() -> Self {
Self {
pool: StringPool::new(),
generation: 0,
ref_counts: Vec::new(),
}
}
pub fn intern(&mut self, s: &str) -> GenerationalString {
let id = self.pool.intern(s);
let idx = id.raw_index() as usize;
while self.ref_counts.len() <= idx {
self.ref_counts.push(0);
}
self.ref_counts[idx] = self.ref_counts[idx].saturating_add(1);
GenerationalString {
index: id.raw_index(),
generation: self.generation,
}
}
pub fn release(&mut self, handle: GenerationalString) {
if handle.generation != self.generation {
return;
}
let idx = handle.index as usize;
if idx < self.ref_counts.len() {
self.ref_counts[idx] = self.ref_counts[idx].saturating_sub(1);
}
}
pub fn resolve(&self, handle: GenerationalString) -> Option<&str> {
if handle.generation != self.generation {
return None;
}
self.pool.resolve(InternedString::from_raw(handle.index))
}
pub fn is_valid(&self, handle: GenerationalString) -> bool {
if handle.generation != self.generation {
return false;
}
let idx = handle.index as usize;
idx < self.ref_counts.len() && self.ref_counts[idx] > 0
}
pub fn compact(&mut self) -> usize {
let mut new_pool = StringPool::new();
let mut new_refs: Vec<u32> = Vec::new();
let mut removed = 0usize;
for (id, s) in self.pool.iter() {
let idx = id.raw_index() as usize;
let rc = if idx < self.ref_counts.len() {
self.ref_counts[idx]
} else {
0
};
if rc > 0 {
let new_id = new_pool.intern(s);
let new_idx = new_id.raw_index() as usize;
while new_refs.len() <= new_idx {
new_refs.push(0);
}
new_refs[new_idx] = rc;
} else {
removed += 1;
}
}
self.pool = new_pool;
self.ref_counts = new_refs;
self.generation = self.generation.wrapping_add(1);
removed
}
pub fn generation(&self) -> u16 {
self.generation
}
pub fn live_count(&self) -> usize {
self.ref_counts.iter().filter(|&&rc| rc > 0).count()
}
pub fn total_count(&self) -> usize {
self.pool.len()
}
}
#[allow(dead_code)]
pub struct FrequencyPool {
pool: StringPool,
counts: Vec<u64>,
}
#[allow(dead_code)]
impl FrequencyPool {
pub fn new() -> Self {
Self {
pool: StringPool::new(),
counts: Vec::new(),
}
}
pub fn intern(&mut self, s: &str) -> InternedString {
let id = self.pool.intern(s);
let idx = id.raw_index() as usize;
while self.counts.len() <= idx {
self.counts.push(0);
}
self.counts[idx] += 1;
id
}
pub fn frequency(&self, id: InternedString) -> u64 {
self.counts
.get(id.raw_index() as usize)
.copied()
.unwrap_or(0)
}
pub fn top_k(&self, k: usize) -> Vec<(InternedString, u64)> {
let mut pairs: Vec<(InternedString, u64)> = self
.pool
.iter()
.map(|(id, _)| (id, self.frequency(id)))
.collect();
pairs.sort_by_key(|b| std::cmp::Reverse(b.1));
pairs.truncate(k);
pairs
}
pub fn resolve(&self, id: InternedString) -> Option<&str> {
self.pool.resolve(id)
}
pub fn pool(&self) -> &StringPool {
&self.pool
}
pub fn total_calls(&self) -> u64 {
self.counts.iter().sum()
}
pub fn len(&self) -> usize {
self.pool.len()
}
pub fn is_empty(&self) -> bool {
self.pool.is_empty()
}
}
#[allow(dead_code)]
pub struct PoolSortedView {
entries: Vec<(InternedString, String)>,
}
#[allow(dead_code)]
impl PoolSortedView {
pub fn build(pool: &StringPool) -> Self {
let mut entries: Vec<(InternedString, String)> =
pool.iter().map(|(id, s)| (id, s.to_string())).collect();
entries.sort_by(|a, b| a.1.cmp(&b.1));
Self { entries }
}
pub fn iter(&self) -> impl Iterator<Item = (InternedString, &str)> {
self.entries.iter().map(|(id, s)| (*id, s.as_str()))
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
}