use std::collections::HashMap;
use std::sync::Arc;
use crate::sync_compat::RwLock;
use crate::iterator::{DictionaryIterator, DictionaryTermIterator};
use crate::suffix_automaton_zipper::SuffixAutomatonZipper;
use crate::value::DictionaryValue;
use crate::{Dictionary, DictionaryNode, SyncStrategy};
pub(crate) type SuffixNode<V = ()> = crate::suffix_automaton_core::SuffixNode<u8, V>;
pub(crate) type SuffixAutomatonInner<V = ()> =
crate::suffix_automaton_core::SuffixAutomatonInner<u8, V>;
#[allow(dead_code)]
mod _legacy_extend_byte {
}
#[derive(Clone, Debug)]
pub struct SuffixAutomaton<V: DictionaryValue = ()> {
pub(crate) inner: Arc<RwLock<SuffixAutomatonInner<V>>>,
}
impl<V: DictionaryValue> SuffixAutomaton<V> {
pub fn new() -> Self {
Self {
inner: Arc::new(RwLock::new(SuffixAutomatonInner::new())),
}
}
pub fn state_count(&self) -> usize {
self.inner.read().nodes.len()
}
#[allow(dead_code)]
pub fn debug_print(&self) {
let inner = self.inner.read();
println!("Suffix Automaton with {} states:", inner.nodes.len());
for (idx, node) in inner.nodes.iter().enumerate() {
println!(
" State {}: is_final={}, max_len={}, edges={:?}, link={:?}",
idx,
node.is_final,
node.max_length,
node.edges
.iter()
.map(|(b, t)| (char::from(*b), t))
.collect::<Vec<_>>(),
node.suffix_link
);
}
}
pub fn from_text(text: &str) -> Self {
let dict = Self::new();
dict.insert(text);
dict
}
pub fn from_texts<I, S>(texts: I) -> Self
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let dict = Self::new();
for text in texts {
dict.insert(text.as_ref());
}
dict
}
pub fn insert(&self, text: &str) -> bool {
let mut inner = self.inner.write();
let string_id = inner.string_count;
inner.source_texts.push(text.to_string());
for ch in text.bytes() {
inner.extend(ch);
}
let last_state = inner.last_state;
inner
.positions
.entry(last_state)
.or_default()
.push((string_id, text.len()));
inner.string_count += 1;
inner.last_state = 0;
true
}
pub fn remove(&self, text: &str) -> bool {
let mut inner = self.inner.write();
let mut state = 0;
for ch in text.bytes() {
match inner.nodes[state].find_edge(ch) {
Some(next) => state = next,
None => return false, }
}
let removed = if let Some(positions) = inner.positions.get_mut(&state) {
let original_len = positions.len();
positions.retain(|(_, end)| *end != text.len());
positions.len() < original_len
} else {
false
};
if removed {
let should_remove = inner
.positions
.get(&state)
.map(|v| v.is_empty())
.unwrap_or(false);
if should_remove {
inner.positions.remove(&state);
}
inner.needs_compaction = true;
inner.string_count -= 1;
true
} else {
false
}
}
pub fn clear(&self) {
let mut inner = self.inner.write();
*inner = SuffixAutomatonInner::new();
}
pub fn compact(&self) {
let mut inner = self.inner.write();
if !inner.needs_compaction {
return;
}
let mut reachable = vec![false; inner.nodes.len()];
let mut stack = vec![0];
while let Some(state) = stack.pop() {
if reachable[state] {
continue;
}
reachable[state] = true;
for &(_, target) in &inner.nodes[state].edges {
stack.push(target);
}
}
let mut new_nodes = Vec::new();
let mut old_to_new = vec![0; inner.nodes.len()];
for (old_idx, node) in inner.nodes.iter().enumerate() {
if reachable[old_idx] {
old_to_new[old_idx] = new_nodes.len();
new_nodes.push(node.clone());
}
}
for node in &mut new_nodes {
for edge in &mut node.edges {
edge.1 = old_to_new[edge.1];
}
if let Some(link) = node.suffix_link {
node.suffix_link = Some(old_to_new[link]);
}
}
let mut new_positions = HashMap::new();
for (old_state, positions) in inner.positions.drain() {
if reachable[old_state] {
new_positions.insert(old_to_new[old_state], positions);
}
}
inner.nodes = new_nodes;
inner.positions = new_positions;
inner.last_state = 0;
inner.needs_compaction = false;
}
pub fn string_count(&self) -> usize {
self.inner.read().string_count
}
pub fn needs_compaction(&self) -> bool {
self.inner.read().needs_compaction
}
pub fn match_positions(&self, substring: &str) -> Vec<(usize, usize)> {
let inner = self.inner.read();
let mut state = 0;
for byte in substring.as_bytes() {
match inner.nodes[state].find_edge(*byte) {
Some(next) => state = next,
None => return Vec::new(), }
}
let mut result = Vec::new();
if let Some(positions) = inner.positions.get(&state) {
result.extend(positions.iter().copied());
}
result
}
pub fn update_or_insert<F>(&self, term: &str, default_value: V, update_fn: F) -> bool
where
F: FnOnce(&mut V),
{
let mut inner = self.inner.write();
let mut state = 0;
for &byte in term.as_bytes() {
match inner.nodes[state].find_edge(byte) {
Some(next) => state = next,
None => {
drop(inner);
return self.insert_with_value_internal(term, default_value);
}
}
}
if inner.nodes[state].value.is_some() {
update_fn(
inner.nodes[state]
.value
.as_mut()
.expect("value.is_some() checked one line above"),
);
false
} else {
inner.nodes[state].value = Some(default_value);
inner.nodes[state].is_final = true;
true
}
}
fn insert_with_value_internal(&self, term: &str, value: V) -> bool {
let mut inner = self.inner.write();
inner.last_state = 0;
for &byte in term.as_bytes() {
inner.extend(byte);
}
let final_state = inner.last_state;
inner.nodes[final_state].value = Some(value);
inner.string_count += 1;
inner.source_texts.push(term.to_string());
inner.last_state = 0;
true
}
pub fn source_texts(&self) -> Vec<String> {
let inner = self.inner.read();
inner.source_texts.clone()
}
pub fn iter_terms(&self) -> DictionaryTermIterator<SuffixAutomatonZipper<V>> {
let zipper = SuffixAutomatonZipper::new_from_dict(self);
DictionaryTermIterator::new(zipper)
}
pub fn iter_bytes(&self) -> DictionaryIterator<SuffixAutomatonZipper<V>> {
let zipper = SuffixAutomatonZipper::new_from_dict(self);
DictionaryIterator::new(zipper)
}
pub fn iter(&self) -> impl Iterator<Item = (String, V)> + '_ {
self.iter_bytes()
.map(|(bytes, value)| (String::from_utf8_lossy(&bytes).into_owned(), value))
}
}
impl<V: DictionaryValue> IntoIterator for &SuffixAutomaton<V> {
type Item = (Vec<u8>, V);
type IntoIter = DictionaryIterator<SuffixAutomatonZipper<V>>;
fn into_iter(self) -> Self::IntoIter {
self.iter_bytes()
}
}
impl<V: DictionaryValue> Default for SuffixAutomaton<V> {
fn default() -> Self {
Self::new()
}
}
#[cfg(feature = "serialization")]
impl<V: DictionaryValue + serde::Serialize> serde::Serialize for SuffixAutomaton<V> {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
let inner = self.inner.read();
inner.serialize(serializer)
}
}
#[cfg(all(feature = "serialization", not(feature = "persistent-artrie")))]
impl<'de, V: DictionaryValue + serde::Deserialize<'de>> serde::Deserialize<'de>
for SuffixAutomaton<V>
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let inner = SuffixAutomatonInner::deserialize(deserializer)?;
Ok(SuffixAutomaton {
inner: Arc::new(RwLock::new(inner)),
})
}
}
#[cfg(all(feature = "serialization", feature = "persistent-artrie"))]
impl<'de, V: DictionaryValue> serde::Deserialize<'de> for SuffixAutomaton<V> {
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let inner = SuffixAutomatonInner::deserialize(deserializer)?;
Ok(SuffixAutomaton {
inner: Arc::new(RwLock::new(inner)),
})
}
}
#[derive(Clone, Debug)]
pub struct SuffixNodeHandle<V: DictionaryValue = ()> {
automaton: Arc<RwLock<SuffixAutomatonInner<V>>>,
state_id: usize,
}
impl<V: DictionaryValue> DictionaryNode for SuffixNodeHandle<V> {
type Unit = u8;
fn is_final(&self) -> bool {
let inner = self.automaton.read();
inner.nodes[self.state_id].is_final
}
fn transition(&self, label: u8) -> Option<Self> {
let inner = self.automaton.read();
inner.nodes[self.state_id]
.find_edge(label)
.map(|target| Self {
automaton: Arc::clone(&self.automaton),
state_id: target,
})
}
fn edges(&self) -> Box<dyn Iterator<Item = (u8, Self)> + '_> {
let inner = self.automaton.read();
let edges = inner.nodes[self.state_id].edges.clone();
drop(inner);
Box::new(edges.into_iter().map(move |(label, target)| {
(
label,
Self {
automaton: Arc::clone(&self.automaton),
state_id: target,
},
)
}))
}
fn has_edge(&self, label: u8) -> bool {
let inner = self.automaton.read();
inner.nodes[self.state_id].find_edge(label).is_some()
}
fn edge_count(&self) -> Option<usize> {
let inner = self.automaton.read();
Some(inner.nodes[self.state_id].edges.len())
}
}
impl<V: DictionaryValue> Dictionary for SuffixAutomaton<V> {
type Node = SuffixNodeHandle<V>;
fn root(&self) -> Self::Node {
SuffixNodeHandle {
automaton: Arc::clone(&self.inner),
state_id: 0,
}
}
fn contains(&self, term: &str) -> bool {
let mut node = self.root();
for byte in term.as_bytes() {
match node.transition(*byte) {
Some(next) => node = next,
None => return false,
}
}
true
}
fn len(&self) -> Option<usize> {
Some(self.string_count())
}
fn sync_strategy(&self) -> SyncStrategy {
SyncStrategy::ExternalSync }
fn is_suffix_based(&self) -> bool {
true }
}
use crate::{MappedDictionary, MappedDictionaryNode, MutableMappedDictionary};
impl<V: DictionaryValue> MappedDictionaryNode for SuffixNodeHandle<V> {
type Value = V;
fn value(&self) -> Option<Self::Value> {
let inner = self.automaton.read();
inner
.nodes
.get(self.state_id)
.and_then(|node| node.value.clone())
}
}
impl<V: DictionaryValue> MappedDictionary for SuffixAutomaton<V> {
type Value = V;
fn get_value(&self, term: &str) -> Option<Self::Value> {
let mut node = self.root();
for byte in term.as_bytes() {
match node.transition(*byte) {
Some(next) => node = next,
None => return None,
}
}
node.value()
}
fn contains_with_value<F>(&self, term: &str, predicate: F) -> bool
where
F: Fn(&Self::Value) -> bool,
{
match self.get_value(term) {
Some(ref value) => predicate(value),
None => false,
}
}
}
impl<V: DictionaryValue> MutableMappedDictionary for SuffixAutomaton<V> {
fn insert_with_value(&self, term: &str, value: Self::Value) -> bool {
self.insert_with_value_internal(term, value)
}
fn update_or_insert<F>(&self, term: &str, default_value: Self::Value, update_fn: F) -> bool
where
F: FnOnce(&mut Self::Value),
{
SuffixAutomaton::update_or_insert(self, term, default_value, update_fn)
}
fn union_with<F>(&self, other: &Self, merge_fn: F) -> usize
where
F: Fn(&Self::Value, &Self::Value) -> Self::Value,
Self::Value: Clone,
{
let mut processed = 0;
for term in other.source_texts() {
if term.is_empty() {
continue; }
if let Some(other_value) = other.get_value(&term) {
processed += 1;
let new_value = if let Some(self_value) = self.get_value(&term) {
merge_fn(&self_value, &other_value)
} else {
other_value.clone()
};
let new_value_clone = new_value.clone();
self.update_or_insert(&term, new_value, move |v| *v = new_value_clone);
}
}
processed
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_empty_automaton() {
let dict = SuffixAutomaton::<()>::new();
assert_eq!(dict.string_count(), 0);
assert!(!dict.needs_compaction());
}
#[test]
fn test_single_character() {
let dict = SuffixAutomaton::<()>::from_text("a");
assert_eq!(dict.string_count(), 1);
assert!(dict.contains("a"));
assert!(!dict.contains("b"));
}
#[test]
fn test_simple_string() {
let dict = SuffixAutomaton::<()>::from_text("abc");
assert_eq!(dict.string_count(), 1);
assert!(dict.contains("abc"));
assert!(dict.contains("bc"));
assert!(dict.contains("c"));
assert!(dict.contains("ab"));
assert!(dict.contains("b"));
assert!(dict.contains("a"));
assert!(!dict.contains("d"));
assert!(!dict.contains("abcd"));
}
#[test]
fn test_repeated_characters() {
let dict = SuffixAutomaton::<()>::from_text("aaa");
assert_eq!(dict.string_count(), 1);
assert!(dict.contains("aaa"));
assert!(dict.contains("aa"));
assert!(dict.contains("a"));
}
#[test]
fn test_complex_string() {
let dict = SuffixAutomaton::<()>::from_text("abcbc");
assert_eq!(dict.string_count(), 1);
assert!(dict.contains("abcbc"));
assert!(dict.contains("bcbc"));
assert!(dict.contains("cbc"));
assert!(dict.contains("bc"));
assert!(dict.contains("c"));
assert!(dict.contains("abc"));
assert!(dict.contains("bcb"));
}
#[test]
fn test_multiple_strings() {
let dict = SuffixAutomaton::<()>::from_texts(vec!["abc", "def"]);
assert_eq!(dict.string_count(), 2);
assert!(dict.contains("abc"));
assert!(dict.contains("bc"));
assert!(dict.contains("c"));
assert!(dict.contains("def"));
assert!(dict.contains("ef"));
assert!(dict.contains("f"));
}
#[test]
fn test_insert_and_remove() {
let dict = SuffixAutomaton::<()>::new();
assert!(dict.insert("test"));
assert_eq!(dict.string_count(), 1);
assert!(dict.contains("test"));
assert!(dict.remove("test"));
assert_eq!(dict.string_count(), 0);
assert!(dict.needs_compaction());
assert!(!dict.remove("test")); }
#[test]
fn test_clear() {
let dict = SuffixAutomaton::<()>::from_texts(vec!["abc", "def", "ghi"]);
assert_eq!(dict.string_count(), 3);
dict.clear();
assert_eq!(dict.string_count(), 0);
assert!(!dict.contains("abc"));
}
#[test]
fn test_compaction() {
let dict = SuffixAutomaton::<()>::new();
dict.insert("test1");
dict.insert("test2");
dict.insert("test3");
assert_eq!(dict.string_count(), 3);
dict.remove("test2");
assert_eq!(dict.string_count(), 2);
assert!(dict.needs_compaction());
dict.compact();
assert!(!dict.needs_compaction());
assert_eq!(dict.string_count(), 2);
assert!(dict.contains("test1"));
assert!(dict.contains("test3"));
}
#[test]
fn test_match_positions() {
let docs = vec!["hello", "world"];
let dict = SuffixAutomaton::<()>::from_texts(docs);
let positions_hello = dict.match_positions("hello");
assert!(!positions_hello.is_empty());
assert_eq!(positions_hello[0].0, 0);
let positions_world = dict.match_positions("world");
assert!(!positions_world.is_empty());
assert_eq!(positions_world[0].0, 1);
let positions_ello = dict.match_positions("ello");
assert!(!positions_ello.is_empty());
}
#[test]
fn test_dictionary_trait() {
let dict = SuffixAutomaton::<()>::from_text("test");
assert_eq!(dict.len(), Some(1));
assert!(!dict.is_empty());
assert_eq!(dict.sync_strategy(), SyncStrategy::ExternalSync);
let root = dict.root();
assert!(root.has_edge(b't'));
let node_t = root.transition(b't').unwrap();
assert!(node_t.has_edge(b'e'));
}
#[test]
fn test_node_edges() {
let dict = SuffixAutomaton::<()>::from_text("ab");
let root = dict.root();
let edges: Vec<_> = root.edges().collect();
assert!(!edges.is_empty());
let labels: Vec<_> = edges.iter().map(|(l, _)| *l).collect();
assert!(labels.contains(&b'a') || labels.contains(&b'b'));
}
#[test]
fn test_mapped_dictionary_basic() {
use crate::MappedDictionary;
let dict: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict.insert_with_value("test", 42);
dict.insert_with_value("hello", 100);
assert_eq!(dict.get_value("test"), Some(42));
assert_eq!(dict.get_value("hello"), Some(100));
assert_eq!(dict.get_value("missing"), None);
}
#[test]
fn test_mapped_dictionary_contains_with_value() {
use crate::MappedDictionary;
let dict: SuffixAutomaton<String> = SuffixAutomaton::new();
dict.insert_with_value("test", "value1".to_string());
dict.insert_with_value("hello", "value2".to_string());
assert!(dict.contains_with_value("test", |v| v == "value1"));
assert!(!dict.contains_with_value("test", |v| v == "wrong"));
assert!(!dict.contains_with_value("missing", |v| v == "value1"));
}
#[test]
fn test_mapped_dictionary_vec_values() {
use crate::MappedDictionary;
let dict: SuffixAutomaton<Vec<usize>> = SuffixAutomaton::new();
dict.insert_with_value("scoped", vec![1, 2, 3]);
dict.insert_with_value("global", vec![0]);
assert_eq!(dict.get_value("scoped"), Some(vec![1, 2, 3]));
assert!(dict.contains_with_value("scoped", |v| v.contains(&2)));
assert!(!dict.contains_with_value("scoped", |v| v.contains(&999)));
}
#[test]
fn test_mapped_node_value() {
use crate::MappedDictionaryNode;
let dict: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict.insert_with_value("test", 42);
let root = dict.root();
let t = root.transition(b't').unwrap();
let e = t.transition(b'e').unwrap();
let s = e.transition(b's').unwrap();
let t2 = s.transition(b't').unwrap();
assert_eq!(t2.value(), Some(42));
assert_eq!(t.value(), None);
}
#[test]
fn test_union_with_both_empty() {
let dict1: SuffixAutomaton<u32> = SuffixAutomaton::new();
let dict2: SuffixAutomaton<u32> = SuffixAutomaton::new();
let processed = dict1.union_with(&dict2, |a, b| a + b);
assert_eq!(processed, 0);
assert_eq!(dict1.string_count(), 0);
}
#[test]
fn test_union_with_self_empty() {
let dict1: SuffixAutomaton<u32> = SuffixAutomaton::new();
let dict2: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict2.insert_with_value("hello", 10);
dict2.insert_with_value("world", 20);
let processed = dict1.union_with(&dict2, |a, b| a + b);
assert!(processed > 0);
assert_eq!(dict1.get_value("hello"), Some(10));
assert_eq!(dict1.get_value("world"), Some(20));
}
#[test]
fn test_union_with_other_empty() {
let dict1: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict1.insert_with_value("hello", 10);
let dict2: SuffixAutomaton<u32> = SuffixAutomaton::new();
let processed = dict1.union_with(&dict2, |a, b| a + b);
assert_eq!(processed, 0);
assert_eq!(dict1.get_value("hello"), Some(10));
}
#[test]
fn test_union_with_no_conflicts() {
let dict1: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict1.insert_with_value("hello", 10);
let dict2: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict2.insert_with_value("world", 20);
let processed = dict1.union_with(&dict2, |a, b| a + b);
assert!(processed > 0);
assert_eq!(dict1.get_value("hello"), Some(10));
assert_eq!(dict1.get_value("world"), Some(20));
}
#[test]
fn test_union_with_conflicts_sum() {
let dict1: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict1.insert_with_value("hello", 10);
let dict2: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict2.insert_with_value("hello", 20);
let processed = dict1.union_with(&dict2, |a, b| a + b);
assert!(processed > 0);
assert_eq!(dict1.get_value("hello"), Some(30));
}
#[test]
fn test_union_with_conflicts_max() {
let dict1: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict1.insert_with_value("hello", 10);
let dict2: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict2.insert_with_value("hello", 20);
let processed = dict1.union_with(&dict2, |a, b| *a.max(b));
assert!(processed > 0);
assert_eq!(dict1.get_value("hello"), Some(20));
}
#[test]
fn test_union_with_partial_conflicts() {
let dict1: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict1.insert_with_value("apple", 1);
dict1.insert_with_value("banana", 2);
let dict2: SuffixAutomaton<u32> = SuffixAutomaton::new();
dict2.insert_with_value("banana", 3);
dict2.insert_with_value("cherry", 4);
let processed = dict1.union_with(&dict2, |a, b| a + b);
assert!(processed > 0);
assert_eq!(dict1.get_value("apple"), Some(1));
assert_eq!(dict1.get_value("banana"), Some(5));
assert_eq!(dict1.get_value("cherry"), Some(4));
}
}