use crate::dynamic_dawg_char_zipper::DynamicDawgCharZipper;
use crate::iterator::DictionaryIterator;
use crate::sync_compat::RwLock;
use crate::value::DictionaryValue;
use crate::{Dictionary, DictionaryNode, SyncStrategy};
use rustc_hash::FxHashMap;
use smallvec::SmallVec;
use std::sync::Arc;
#[derive(Clone, Debug)]
pub struct DynamicDawgChar<V: DictionaryValue = ()> {
pub(crate) inner: Arc<RwLock<DynamicDawgCharInner<V>>>,
}
pub(crate) type DynamicDawgCharInner<V = ()> = crate::dawg_core::DawgCore<char, V>;
use crate::bloom_filter::BloomFilter;
pub(crate) type DawgNodeChar<V = ()> = crate::dawg_core::DawgNode<char, V>;
impl<V: DictionaryValue> DynamicDawgChar<V> {
pub fn new() -> Self {
Self::with_auto_minimize_threshold(f32::INFINITY)
}
pub fn with_auto_minimize_threshold(threshold: f32) -> Self {
Self::with_config(threshold, None)
}
pub fn with_config(auto_minimize_threshold: f32, bloom_filter_capacity: Option<usize>) -> Self {
let nodes = vec![DawgNodeChar::new(false)];
let bloom_filter = bloom_filter_capacity.map(BloomFilter::new);
DynamicDawgChar {
inner: Arc::new(RwLock::new(DynamicDawgCharInner {
nodes,
term_count: 0,
needs_compaction: false,
suffix_cache: FxHashMap::default(),
last_minimized_node_count: 1, auto_minimize_threshold,
bloom_filter,
})),
}
}
pub fn from_terms<I, S>(terms: I) -> Self
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let mut term_vec: Vec<String> = terms.into_iter().map(|s| s.as_ref().to_string()).collect();
term_vec.sort_unstable();
let dawg = Self::new();
for term in term_vec {
dawg.insert(&term);
}
dawg
}
pub fn from_sorted_terms<I, S>(terms: I) -> Self
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let dawg = Self::new();
for term in terms {
dawg.insert(term.as_ref());
}
dawg
}
pub fn from_terms_with_values<I, S>(entries: I) -> Self
where
I: IntoIterator<Item = (S, V)>,
S: AsRef<str>,
{
let mut pairs: Vec<(String, V)> = entries
.into_iter()
.map(|(s, v)| (s.as_ref().to_string(), v))
.collect();
pairs.sort_by(|a, b| a.0.cmp(&b.0));
let dawg = Self::new();
for (term, value) in pairs {
dawg.insert_with_value(&term, value);
}
dawg
}
pub fn insert(&self, term: &str) -> bool {
let mut inner = self.inner.write();
let chars: Vec<char> = term.chars().collect();
let inserted = inner.insert_units(&chars);
if inserted {
inner.bloom_insert(term);
}
inserted
}
pub fn insert_with_value(&self, term: &str, value: V) -> bool {
let mut inner = self.inner.write();
let chars: Vec<char> = term.chars().collect();
let inserted = inner.insert_units_with_value(&chars, value);
if inserted {
inner.bloom_insert(term);
}
inserted
}
pub fn update_or_insert<F>(&self, term: &str, default_value: V, update_fn: F) -> bool
where
F: FnOnce(&mut V),
{
let mut inner = self.inner.write();
let chars: Vec<char> = term.chars().collect();
let inserted = inner.update_or_insert_units(&chars, default_value, update_fn);
if inserted {
inner.bloom_insert(term);
}
inserted
}
pub fn get_value(&self, term: &str) -> Option<V> {
let inner = self.inner.read();
let chars: Vec<char> = term.chars().collect();
let mut node_idx = 0;
for &ch in &chars {
match inner.nodes[node_idx]
.edges
.iter()
.find(|(c, _)| *c == ch)
.map(|(_, idx)| idx)
{
Some(&child_idx) => node_idx = child_idx,
None => return None,
}
}
if inner.nodes[node_idx].is_final {
inner.nodes[node_idx].value.clone()
} else {
None
}
}
pub fn remove(&self, term: &str) -> bool {
let mut inner = self.inner.write();
let chars: Vec<char> = term.chars().collect();
inner.remove_units(&chars)
}
pub fn compact(&self) -> usize {
let mut inner = self.inner.write();
inner.compact()
}
pub fn minimize(&self) -> usize {
let mut inner = self.inner.write();
inner.minimize_incremental()
}
pub fn extend<I, S>(&self, terms: I) -> usize
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let mut term_vec: Vec<String> = terms.into_iter().map(|s| s.as_ref().to_string()).collect();
term_vec.sort_unstable();
let mut added = 0;
for term in term_vec {
if self.insert(&term) {
added += 1;
}
}
if added > 0 {
self.compact();
}
added
}
pub fn remove_many<I, S>(&self, terms: I) -> usize
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let mut removed = 0;
for term in terms {
if self.remove(term.as_ref()) {
removed += 1;
}
}
if removed > 0 {
self.compact();
}
removed
}
pub fn term_count(&self) -> usize {
self.inner.read().term_count
}
pub fn node_count(&self) -> usize {
self.inner.read().nodes.len()
}
pub fn needs_compaction(&self) -> bool {
self.inner.read().needs_compaction
}
pub fn contains(&self, term: &str) -> bool {
let inner = self.inner.read();
if let Some(ref bloom) = inner.bloom_filter {
if !bloom.might_contain(term) {
return false; }
}
drop(inner); let mut node = self.root();
for ch in term.chars() {
if let Some(next_node) = node.transition(ch) {
node = next_node;
} else {
return false;
}
}
node.is_final()
}
}
impl<V: DictionaryValue> DynamicDawgChar<V> {
pub fn iter_chars(&self) -> DictionaryIterator<DynamicDawgCharZipper<V>> {
let zipper = DynamicDawgCharZipper::new_from_dict(self);
DictionaryIterator::new(zipper)
}
pub fn iter(&self) -> impl Iterator<Item = (String, V)> + '_ {
self.iter_chars()
.map(|(chars, value)| (chars.into_iter().collect::<String>(), value))
}
}
impl<V: DictionaryValue> IntoIterator for &DynamicDawgChar<V> {
type Item = (Vec<char>, V);
type IntoIter = DictionaryIterator<DynamicDawgCharZipper<V>>;
fn into_iter(self) -> Self::IntoIter {
self.iter_chars()
}
}
impl<V: DictionaryValue> Default for DynamicDawgChar<V> {
fn default() -> Self {
Self::new()
}
}
#[cfg(feature = "serialization")]
impl<V: DictionaryValue + serde::Serialize> serde::Serialize for DynamicDawgChar<V> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let inner = self.inner.read();
inner.serialize(serializer)
}
}
#[cfg(all(feature = "serialization", not(feature = "persistent-artrie")))]
impl<'de, V: DictionaryValue + serde::Deserialize<'de>> serde::Deserialize<'de>
for DynamicDawgChar<V>
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let inner = DynamicDawgCharInner::deserialize(deserializer)?;
Ok(DynamicDawgChar {
inner: Arc::new(RwLock::new(inner)),
})
}
}
#[cfg(all(feature = "serialization", feature = "persistent-artrie"))]
impl<'de, V: DictionaryValue> serde::Deserialize<'de> for DynamicDawgChar<V> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let inner = DynamicDawgCharInner::deserialize(deserializer)?;
Ok(DynamicDawgChar {
inner: Arc::new(RwLock::new(inner)),
})
}
}
impl<V: DictionaryValue> Dictionary for DynamicDawgChar<V> {
type Node = DynamicDawgCharNode<V>;
fn root(&self) -> Self::Node {
let inner = self.inner.read();
let node = &inner.nodes[0];
DynamicDawgCharNode {
dawg: Arc::clone(&self.inner),
node_idx: 0,
is_final: node.is_final,
edges: node.edges.clone(),
}
}
fn len(&self) -> Option<usize> {
Some(self.term_count())
}
fn sync_strategy(&self) -> SyncStrategy {
SyncStrategy::ExternalSync
}
}
#[derive(Clone)]
pub struct DynamicDawgCharNode<V: DictionaryValue = ()> {
dawg: Arc<RwLock<DynamicDawgCharInner<V>>>,
node_idx: usize,
is_final: bool,
edges: SmallVec<[(char, usize); 4]>,
}
impl<V: DictionaryValue> DictionaryNode for DynamicDawgCharNode<V> {
type Unit = char;
fn is_final(&self) -> bool {
self.is_final
}
fn transition(&self, label: char) -> Option<Self> {
let child_idx = if self.edges.len() < 16 {
self.edges
.iter()
.find(|(c, _)| *c == label)
.map(|(_, idx)| *idx)
} else {
self.edges
.binary_search_by_key(&label, |(c, _)| *c)
.ok()
.map(|i| self.edges[i].1)
}?;
let inner = self.dawg.read();
let child_node = &inner.nodes[child_idx];
Some(DynamicDawgCharNode {
dawg: Arc::clone(&self.dawg),
node_idx: child_idx,
is_final: child_node.is_final,
edges: child_node.edges.clone(),
})
}
fn edges(&self) -> Box<dyn Iterator<Item = (char, Self)> + '_> {
let inner = self.dawg.read();
let child_data: Vec<_> = self
.edges
.iter()
.map(|(ch, idx)| {
let child_node = &inner.nodes[*idx];
(*ch, *idx, child_node.is_final, child_node.edges.clone())
})
.collect();
drop(inner);
let dawg = Arc::clone(&self.dawg);
Box::new(
child_data
.into_iter()
.map(move |(ch, idx, is_final, edges)| {
(
ch,
DynamicDawgCharNode {
dawg: Arc::clone(&dawg),
node_idx: idx,
is_final,
edges,
},
)
}),
)
}
fn edge_count(&self) -> Option<usize> {
Some(self.edges.len())
}
}
use crate::{MappedDictionary, MappedDictionaryNode};
impl<V: DictionaryValue> MappedDictionaryNode for DynamicDawgCharNode<V> {
type Value = V;
fn value(&self) -> Option<Self::Value> {
let inner = self.dawg.read();
inner
.nodes
.get(self.node_idx)
.and_then(|node| node.value.clone())
}
}
impl<V: DictionaryValue> MappedDictionary for DynamicDawgChar<V> {
type Value = V;
fn get_value(&self, term: &str) -> Option<Self::Value> {
Self::get_value(self, term)
}
fn contains_with_value<F>(&self, term: &str, predicate: F) -> bool
where
F: Fn(&Self::Value) -> bool,
{
match self.get_value(term) {
Some(ref value) => predicate(value),
None => false,
}
}
}
impl<V: DictionaryValue> crate::MutableDictionary for DynamicDawgChar<V> {
fn insert(&self, term: &str) -> bool {
Self::insert(self, term)
}
fn remove(&self, term: &str) -> bool {
Self::remove(self, term)
}
fn extend<I, S>(&self, terms: I) -> usize
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
Self::extend(self, terms)
}
fn remove_many<I, S>(&self, terms: I) -> usize
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
Self::remove_many(self, terms)
}
}
impl<V: DictionaryValue> crate::CompactableDictionary for DynamicDawgChar<V> {
fn needs_compaction(&self) -> bool {
Self::needs_compaction(self)
}
fn compact(&self) -> usize {
Self::compact(self)
}
fn minimize(&self) -> usize {
Self::minimize(self)
}
}
impl<V: DictionaryValue> crate::MutableMappedDictionary for DynamicDawgChar<V> {
fn insert_with_value(&self, term: &str, value: Self::Value) -> bool {
Self::insert_with_value(self, term, value)
}
fn union_with<F>(&self, other: &Self, merge_fn: F) -> usize
where
F: Fn(&Self::Value, &Self::Value) -> Self::Value,
Self::Value: Clone,
{
let entries: Vec<(String, Option<Self::Value>)> = {
let other_inner = other.inner.read();
let mut out = Vec::new();
let mut stack: Vec<(usize, Vec<char>)> = vec![(0, Vec::new())];
while let Some((node_idx, path)) = stack.pop() {
let node = &other_inner.nodes[node_idx];
if node.is_final {
let term: String = path.iter().collect();
out.push((term, node.value.clone()));
}
for &(label, target_idx) in node.edges.iter().rev() {
let mut child_path = path.clone();
child_path.push(label);
stack.push((target_idx, child_path));
}
}
out
};
let mut processed = 0;
for (term, other_value) in entries {
processed += 1;
if let Some(other_value) = other_value {
if let Some(self_value) = self.get_value(&term) {
let merged = merge_fn(&self_value, &other_value);
self.insert_with_value(&term, merged);
} else {
self.insert_with_value(&term, other_value);
}
}
}
processed
}
fn update_or_insert<F>(&self, term: &str, default_value: Self::Value, update_fn: F) -> bool
where
F: FnOnce(&mut Self::Value),
{
Self::update_or_insert(self, term, default_value, update_fn)
}
}
#[cfg(test)]
mod tests {
use super::*;
use log::debug;
#[test]
fn test_dynamic_dawg_insert() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
assert!(dawg.insert("test"));
assert!(!dawg.insert("test")); assert!(dawg.insert("testing"));
assert_eq!(dawg.term_count(), 2);
}
#[test]
fn test_dynamic_dawg_remove() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
dawg.insert("test");
dawg.insert("testing");
dawg.insert("tested");
assert!(dawg.remove("testing"));
assert_eq!(dawg.term_count(), 2);
assert!(!dawg.remove("testing")); }
#[test]
fn test_dynamic_dawg_compact() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
dawg.insert("test");
dawg.insert("testing");
dawg.insert("tested");
let before = dawg.node_count();
dawg.remove("testing");
let removed = dawg.compact();
let after = dawg.node_count();
assert!(removed > 0 || before == after);
assert_eq!(dawg.term_count(), 2);
}
#[test]
fn test_compaction_flag() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
dawg.insert("test");
assert!(!dawg.needs_compaction());
dawg.remove("test");
assert!(dawg.needs_compaction());
dawg.compact();
assert!(!dawg.needs_compaction());
}
#[test]
fn test_batch_extend() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
dawg.insert("test");
let new_terms = vec!["testing", "tested", "tester"];
let added = dawg.extend(new_terms);
assert_eq!(added, 3);
assert_eq!(dawg.term_count(), 4);
assert!(dawg.contains("test"));
assert!(dawg.contains("testing"));
}
#[test]
fn test_batch_remove_many() {
let dawg: DynamicDawgChar<()> =
DynamicDawgChar::from_terms(vec!["test", "testing", "tested", "tester"]);
let to_remove = vec!["testing", "tester"];
let removed = dawg.remove_many(to_remove);
assert_eq!(removed, 2);
assert_eq!(dawg.term_count(), 2);
assert!(dawg.contains("test"));
assert!(!dawg.contains("testing"));
}
#[test]
fn test_minimize_basic() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
dawg.insert("zebra");
dawg.insert("apple");
dawg.insert("banana");
dawg.insert("apricot");
let nodes_before = dawg.node_count();
let merged = dawg.minimize();
let nodes_after = dawg.node_count();
assert_eq!(nodes_after, nodes_before - merged);
assert_eq!(dawg.term_count(), 4);
assert!(dawg.contains("zebra"));
assert!(dawg.contains("apple"));
assert!(dawg.contains("banana"));
assert!(dawg.contains("apricot"));
}
#[test]
fn test_minimize_vs_compact() {
let _terms = ["band", "banana", "bandana", "can", "cane", "candy"];
let dawg1: DynamicDawgChar<()> = DynamicDawgChar::new();
let dawg2: DynamicDawgChar<()> = DynamicDawgChar::new();
for term in ["zebra", "apple", "banana", "apricot", "band", "bandana"] {
dawg1.insert(term);
dawg2.insert(term);
}
let merged1 = dawg1.minimize();
let merged2 = dawg2.compact();
println!(
"After minimize: {} nodes (merged {})",
dawg1.node_count(),
merged1
);
println!(
"After compact: {} nodes (removed {})",
dawg2.node_count(),
merged2
);
for term in ["zebra", "apple", "banana", "apricot", "band", "bandana"] {
assert!(
dawg1.contains(term),
"minimize() DAWG missing term: {}",
term
);
assert!(
dawg2.contains(term),
"compact() DAWG missing term: {}",
term
);
}
assert_eq!(dawg1.term_count(), dawg2.term_count());
if dawg1.node_count() != dawg2.node_count() {
debug!(
"minimize() produced {} nodes, compact() produced {} nodes (expected difference)",
dawg1.node_count(),
dawg2.node_count()
);
}
}
#[test]
fn test_minimize_after_deletions() {
let dawg: DynamicDawgChar<()> =
DynamicDawgChar::from_terms(vec!["test", "testing", "tested", "tester", "testimony"]);
dawg.remove("testing");
dawg.remove("tester");
assert!(dawg.needs_compaction());
let nodes_before = dawg.node_count();
let merged = dawg.minimize();
let nodes_after = dawg.node_count();
assert!(merged > 0);
assert_eq!(nodes_after, nodes_before - merged);
assert!(dawg.contains("test"));
assert!(dawg.contains("tested"));
assert!(dawg.contains("testimony"));
assert!(!dawg.contains("testing"));
assert!(!dawg.contains("tester"));
}
#[test]
fn test_minimize_empty() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
let merged = dawg.minimize();
assert_eq!(merged, 0);
assert_eq!(dawg.node_count(), 1); assert_eq!(dawg.term_count(), 0);
}
#[test]
fn test_minimize_single_term() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
dawg.insert("hello");
let nodes_before = dawg.node_count();
let merged = dawg.minimize();
let nodes_after = dawg.node_count();
assert_eq!(merged, 0);
assert_eq!(nodes_before, nodes_after);
assert!(dawg.contains("hello"));
}
#[test]
fn test_minimize_with_shared_suffixes() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
dawg.insert("testing");
dawg.insert("running");
dawg.insert("test");
dawg.insert("run");
let _merged = dawg.minimize();
assert!(dawg.contains("testing"));
assert!(dawg.contains("running"));
assert!(dawg.contains("test"));
assert!(dawg.contains("run"));
}
#[test]
fn test_minimize_idempotent() {
let dawg: DynamicDawgChar<()> =
DynamicDawgChar::from_terms(vec!["apple", "application", "apply", "apricot"]);
let _merged1 = dawg.minimize();
let nodes1 = dawg.node_count();
let merged2 = dawg.minimize();
let nodes2 = dawg.node_count();
assert_eq!(merged2, 0);
assert_eq!(nodes1, nodes2);
}
#[test]
fn test_minimize_no_false_positives() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
let inserted_terms = vec!["zebra", "apple", "banana", "apricot", "band", "bandana"];
let not_inserted_terms = vec!["app", "ban", "zeb", "banan", "apric", "bandanas"];
for term in &inserted_terms {
dawg.insert(term);
}
dawg.minimize();
for term in &inserted_terms {
assert!(
dawg.contains(term),
"Should contain inserted term: {}",
term
);
}
for term in ¬_inserted_terms {
assert!(
!dawg.contains(term),
"Should NOT contain term that wasn't inserted: {}",
term
);
}
}
#[test]
fn test_valued_dawg_basic() {
let dawg: DynamicDawgChar<u32> = DynamicDawgChar::new();
assert!(dawg.insert_with_value("hello", 42));
assert!(dawg.insert_with_value("world", 100));
assert!(dawg.insert_with_value("test", 1));
assert_eq!(dawg.get_value("hello"), Some(42));
assert_eq!(dawg.get_value("world"), Some(100));
assert_eq!(dawg.get_value("test"), Some(1));
assert_eq!(dawg.get_value("unknown"), None);
assert!(!dawg.insert_with_value("hello", 999));
assert_eq!(dawg.get_value("hello"), Some(999));
assert_eq!(dawg.term_count(), 3);
}
#[test]
fn test_valued_dawg_with_remove() {
let dawg: DynamicDawgChar<String> = DynamicDawgChar::new();
dawg.insert_with_value("key1", "value1".to_string());
dawg.insert_with_value("key2", "value2".to_string());
assert_eq!(dawg.get_value("key1"), Some("value1".to_string()));
assert!(dawg.remove("key1"));
assert_eq!(dawg.get_value("key1"), None);
assert_eq!(dawg.get_value("key2"), Some("value2".to_string()));
}
#[test]
fn test_mapped_dictionary_trait() {
use crate::MappedDictionary;
let dawg: DynamicDawgChar<Vec<u32>> = DynamicDawgChar::new();
dawg.insert_with_value("scoped", vec![1, 2, 3]);
dawg.insert_with_value("global", vec![0]);
assert_eq!(dawg.get_value("scoped"), Some(vec![1, 2, 3]));
assert!(dawg.contains_with_value("scoped", |v| v.contains(&2)));
assert!(!dawg.contains_with_value("scoped", |v| v.contains(&999)));
assert!(!dawg.contains_with_value("unknown", |v| v.contains(&1)));
}
#[test]
fn test_compact_no_false_positives() {
let dawg: DynamicDawgChar<()> = DynamicDawgChar::new();
let inserted_terms = vec!["zebra", "apple", "banana", "apricot", "band", "bandana"];
let not_inserted_terms = vec!["app", "ban", "zeb", "banan", "apric", "bandanas"];
for term in &inserted_terms {
dawg.insert(term);
}
dawg.compact();
for term in &inserted_terms {
assert!(
dawg.contains(term),
"Should contain inserted term: {}",
term
);
}
for term in ¬_inserted_terms {
assert!(
!dawg.contains(term),
"Should NOT contain term that wasn't inserted: {}",
term
);
}
}
}