use std::collections::HashMap;
pub struct FastrieBuilderNode<V> {
built: bool,
children: HashMap<u8, FastrieBuilderNode<V>>,
index_width: IndexWidth,
value: Option<V>,
}
const RESERVED_BYTE: u8 = 0xFF;
const MAX_CLUSTER_GAP_LEN: i16 = 3;
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub struct IndexWidth(pub usize);
impl IndexWidth {
fn reserve_idx(self, vec: &mut Vec<u8>) -> usize {
let pos = vec.len();
for _ in 0..self.0 {
vec.push(RESERVED_BYTE);
};
pos
}
fn write_idx(self, vec: &mut Vec<u8>, pos: usize, mut idx: usize) -> () {
for i in 0..self.0 {
vec[pos + i] = idx as u8;
idx >>= 8;
};
}
fn push_idx(self, vec: &mut Vec<u8>, idx: usize) -> () {
let pos = self.reserve_idx(vec);
self.write_idx(vec, pos, idx);
}
fn read_idx(self, data: &[u8], pos: usize) -> usize {
let mut idx = 0usize;
for i in 0..self.0 {
idx |= (data[pos + i] as usize) << (8 * i);
}
idx
}
}
pub struct FastrieBuild<V> {
pub data: Vec<u8>,
pub index_width: IndexWidth,
pub values: Vec<V>,
}
impl<V> FastrieBuilderNode<V> {
pub fn new(index_width: IndexWidth) -> FastrieBuilderNode<V> {
FastrieBuilderNode {
built: false,
children: HashMap::new(),
index_width,
value: None,
}
}
pub fn add(&mut self, pattern: &[u8], value: V) -> () {
let mut current: &mut FastrieBuilderNode<V> = self;
for c in pattern {
if !current.children.contains_key(c) {
current.children.insert(*c, FastrieBuilderNode::new(current.index_width));
};
current = current.children.get_mut(c).unwrap();
};
current.value = Some(value);
}
fn _build(&mut self, data: &mut Vec<u8>, values: &mut Vec<V>) -> () {
assert!(!self.built);
self.built = true;
let value_idx: usize = if self.value.is_some() {
let idx = values.len() + 1;
values.push(self.value.take().unwrap());
idx
} else { 0 };
self.index_width.push_idx(data, value_idx);
let mut child_chars = self.children.keys().map(|k| *k).collect::<Vec<u8>>();
child_chars.sort();
let mut child_char_clusters: Vec<Vec<Option<u8>>> = vec![];
let mut last_char: i16 = std::i16::MIN;
for c in child_chars {
let p = c as i16;
if last_char + MAX_CLUSTER_GAP_LEN < p {
child_char_clusters.push(Vec::new());
} else {
for _ in last_char..p - 1 {
child_char_clusters.last_mut().unwrap().push(None);
};
};
child_char_clusters.last_mut().unwrap().push(Some(c));
last_char = p;
};
child_char_clusters.sort_by(|a, b| b.len().cmp(&a.len()));
let mut replace_with_child_indices: HashMap<u8, usize> = HashMap::new();
data.push(!self.children.is_empty() as u8);
let mut last_cluster_next_cluster_dist_pos: Option<usize> = None;
for cluster in &child_char_clusters {
let cluster_pos = data.len();
if let Some(out_pos) = last_cluster_next_cluster_dist_pos {
self.index_width.write_idx(data, out_pos, cluster_pos);
};
last_cluster_next_cluster_dist_pos = Some(self.index_width.reserve_idx(data));
let min = cluster.first().unwrap().unwrap();
let max = cluster.last().unwrap().unwrap();
data.push(min);
data.push(max);
for c in cluster {
match c {
Some(c) => {
debug_assert!(!replace_with_child_indices.contains_key(c));
replace_with_child_indices.insert(*c, self.index_width.reserve_idx(data));
}
None => { self.index_width.push_idx(data, 0); }
};
};
};
if let Some(out_pos) = last_cluster_next_cluster_dist_pos {
self.index_width.write_idx(data, out_pos, 0);
};
for cluster in &child_char_clusters {
for c in cluster {
if let Some(c) = c {
self.index_width.write_idx(data, *replace_with_child_indices.get(c).unwrap(), data.len());
let child_node = self.children.get_mut(c).unwrap();
child_node._build(data, values);
};
};
};
}
pub fn prebuild(&mut self) -> FastrieBuild<V> {
let mut data: Vec<u8> = Vec::new();
let mut values: Vec<V> = Vec::new();
self._build(&mut data, &mut values);
FastrieBuild {
data,
index_width: self.index_width,
values,
}
}
}
pub struct Fastrie<'v, 'd, V> {
data: &'d [u8],
index_width: IndexWidth,
values: Option<&'v [V]>,
}
pub struct FastrieMatch<'v, V> {
pub end: usize,
pub value: &'v V,
}
pub const fn from_prebuilt_without_values<'d>(index_width: IndexWidth, data: &'d [u8]) -> Fastrie<'_, 'd, ()> {
Fastrie {
data,
index_width,
values: None,
}
}
impl<V> Fastrie<'_, '_, V> {
pub const fn from_prebuilt<'v, 'd>(index_width: IndexWidth, values: &'v [V], data: &'d [u8]) -> Fastrie<'v, 'd, V> {
Fastrie {
data,
index_width,
values: Some(values),
}
}
pub fn memory_size(&self) -> usize {
self.data.len()
}
fn _longest_matching_prefix(&self, text: &[u8]) -> Option<(usize, usize)> {
let mut node_pos: usize = 0;
let mut match_opt: Option<(usize, usize)> = None;
'outer: for (i, &c) in text.iter().enumerate() {
let idx_bytes = self.index_width.0;
if self.data[node_pos + idx_bytes] == 0 {
break;
};
let mut cluster_pos: usize = node_pos + idx_bytes + 1;
loop {
let next_cluster_pos = self.index_width.read_idx(&self.data, cluster_pos);
let cluster_min: u8 = self.data[cluster_pos + idx_bytes];
let cluster_max: u8 = self.data[cluster_pos + idx_bytes + 1];
if c >= cluster_min && c <= cluster_max {
node_pos = self.index_width.read_idx(&self.data, cluster_pos + idx_bytes + 2 + ((c - cluster_min) as usize) * idx_bytes);
if node_pos == 0 {
break 'outer;
} else {
break;
};
};
if next_cluster_pos == 0 {
break 'outer;
};
cluster_pos = next_cluster_pos;
};
let node_value_idx: usize = self.index_width.read_idx(&self.data, node_pos);
if node_value_idx != 0 {
match_opt = Some((i, node_value_idx - 1));
};
};
match_opt
}
pub fn contains_key(&self, key: &[u8]) -> bool {
self._longest_matching_prefix(key).filter(|(i, _)| *i == key.len() - 1).is_some()
}
pub fn longest_matching_prefix(&self, text: &[u8]) -> Option<FastrieMatch<V>> {
self._longest_matching_prefix(text).map(|(end, value_idx)| FastrieMatch {
end,
value: &self.values.unwrap()[value_idx],
})
}
}