use std::sync::Arc;
use crate::substring::{BidirectionalDictionaryNode, SubstringDictionary, SubstringMatch};
use crate::sync_compat::RwLock;
use crate::value::DictionaryValue;
use crate::{Dictionary, DictionaryNode};
const NIL: usize = usize::MAX;
#[allow(dead_code)]
const END_MARKER_BASE: u8 = 0x01;
type ScdawgNode<V = ()> = crate::scdawg_core::ScdawgNode<u8, V>;
type ScdawgInner<V = ()> = crate::scdawg_core::ScdawgCoreInner<u8, V>;
#[derive(Clone, Debug)]
pub struct Scdawg<V: DictionaryValue = ()> {
inner: Arc<RwLock<ScdawgInner<V>>>,
}
impl<V: DictionaryValue> Default for Scdawg<V> {
fn default() -> Self {
Self::new()
}
}
impl<V: DictionaryValue> Scdawg<V> {
pub fn new() -> Self {
Self {
inner: Arc::new(RwLock::new(ScdawgInner::new())),
}
}
pub fn from_terms<I, S>(terms: I) -> Self
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let terms_vec: Vec<S> = terms.into_iter().collect();
let term_count = terms_vec.len();
let total_chars: usize = terms_vec.iter().map(|s| s.as_ref().len()).sum();
let inner = ScdawgInner::with_capacity(term_count, total_chars);
let scdawg = Self {
inner: Arc::new(RwLock::new(inner)),
};
{
let mut inner = scdawg.inner.write();
for term in terms_vec {
inner.insert(term.as_ref());
}
inner.compute_left_edges();
}
scdawg
}
pub fn from_terms_with_values<I, S>(entries: I) -> Self
where
I: IntoIterator<Item = (S, V)>,
S: AsRef<str>,
{
let pairs: Vec<(String, V)> = entries
.into_iter()
.map(|(s, v)| (s.as_ref().to_string(), v))
.collect();
let total_chars: usize = pairs.iter().map(|(s, _)| s.len()).sum();
let inner = ScdawgInner::with_capacity(pairs.len(), total_chars);
let scdawg = Self {
inner: Arc::new(RwLock::new(inner)),
};
{
let mut inner = scdawg.inner.write();
for (term, value) in pairs {
inner.insert_with_value(&term, value);
}
inner.compute_left_edges();
}
scdawg
}
pub fn insert(&self, term: &str) -> bool {
let mut inner = self.inner.write();
let result = inner.insert(term);
if result {
inner.compute_left_edges();
}
result
}
pub fn insert_with_value(&self, term: &str, value: V) -> bool {
let mut inner = self.inner.write();
let result = inner.insert_with_value(term, value);
if result {
inner.compute_left_edges();
}
result
}
pub fn get_value(&self, term: &str) -> Option<V>
where
V: Clone,
{
let inner = self.inner.read();
if let Some(value) = inner.term_values.get(term) {
return Some(value.clone());
}
let mut current = 0;
for byte in term.bytes() {
match inner.nodes[current].get_edge(byte) {
Some(next) => current = next,
None => return None,
}
}
if inner.nodes[current].is_final {
inner.nodes[current].value.clone()
} else {
None
}
}
pub fn contains_substring(&self, pattern: &str) -> bool {
let inner = self.inner.read();
inner.contains_substring(pattern)
}
pub fn iter(&self) -> impl Iterator<Item = String> {
let inner = self.inner.read();
inner.terms.clone().into_iter()
}
pub fn term_count(&self) -> usize {
self.inner.read().term_count()
}
pub fn find(&self, pattern: &str) -> Option<ScdawgNodeHandle<V>> {
let inner = self.inner.read();
inner
.find_substring_fast(pattern)
.map(|node_idx| ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx,
})
}
pub fn freq(&self, pattern: &str) -> usize {
let inner = self.inner.read();
inner.frequency(pattern)
}
pub fn freq_at(&self, handle: &ScdawgNodeHandle<V>) -> usize {
let inner = self.inner.read();
let mut count = 0;
inner.count_occurrences(handle.node_idx, &mut count);
count
}
pub fn locations(&self, pattern: &str) -> Vec<(String, usize)> {
let inner = self.inner.read();
inner.find_exact_substring(pattern)
}
pub fn locations_at(
&self,
handle: &ScdawgNodeHandle<V>,
pattern_len: usize,
) -> Vec<(String, usize)> {
let inner = self.inner.read();
let mut results = Vec::new();
inner.collect_term_positions(handle.node_idx, pattern_len, &mut results);
results
}
}
impl<V: DictionaryValue> Dictionary for Scdawg<V> {
type Node = ScdawgNodeHandle<V>;
fn len(&self) -> Option<usize> {
Some(self.inner.read().term_count())
}
fn contains(&self, term: &str) -> bool {
self.inner.read().contains(term)
}
fn root(&self) -> Self::Node {
ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: 0,
}
}
fn sync_strategy(&self) -> crate::SyncStrategy {
crate::SyncStrategy::ExternalSync
}
}
impl<V: DictionaryValue> crate::MappedDictionary for Scdawg<V> {
type Value = V;
fn get_value(&self, term: &str) -> Option<Self::Value> {
Self::get_value(self, term)
}
}
#[derive(Clone)]
pub struct ScdawgNodeHandle<V: DictionaryValue = ()> {
inner: Arc<RwLock<ScdawgInner<V>>>,
node_idx: usize,
}
impl<V: DictionaryValue> std::fmt::Debug for ScdawgNodeHandle<V> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ScdawgNodeHandle")
.field("node_idx", &self.node_idx)
.finish()
}
}
impl<V: DictionaryValue> DictionaryNode for ScdawgNodeHandle<V> {
type Unit = u8;
fn is_final(&self) -> bool {
let inner = self.inner.read();
inner.nodes[self.node_idx].is_final
}
fn transition(&self, label: u8) -> Option<Self> {
let inner = self.inner.read();
inner.nodes[self.node_idx]
.get_edge(label)
.map(|idx| ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: idx,
})
}
fn edges(&self) -> Box<dyn Iterator<Item = (u8, Self)> + '_> {
let inner = self.inner.read();
let edges: Vec<_> = inner.nodes[self.node_idx]
.forward_edges
.iter()
.map(|&(label, idx)| {
(
label,
ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: idx,
},
)
})
.collect();
Box::new(edges.into_iter())
}
fn edge_count(&self) -> Option<usize> {
let inner = self.inner.read();
Some(inner.nodes[self.node_idx].forward_edges.len())
}
}
unsafe impl<V: DictionaryValue> Send for ScdawgNodeHandle<V> {}
unsafe impl<V: DictionaryValue> Sync for ScdawgNodeHandle<V> {}
impl<V: DictionaryValue> BidirectionalDictionaryNode for ScdawgNodeHandle<V> {
fn parent(&self) -> Option<Self> {
let inner = self.inner.read();
let node = &inner.nodes[self.node_idx];
if node.parent == NIL {
None
} else {
Some(ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: node.parent,
})
}
}
fn parent_label(&self) -> Option<u8> {
let inner = self.inner.read();
let node = &inner.nodes[self.node_idx];
if node.parent == NIL {
None
} else {
Some(node.parent_label)
}
}
fn reverse_edges(&self) -> Box<dyn Iterator<Item = (u8, Self)> + '_> {
let inner = self.inner.read();
let edges: Vec<_> = inner.nodes[self.node_idx]
.left_edges
.iter()
.map(|&(label, idx)| {
(
label,
ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: idx,
},
)
})
.collect();
Box::new(edges.into_iter())
}
fn reverse_transition(&self, label: u8) -> Vec<Self> {
let inner = self.inner.read();
inner.nodes[self.node_idx]
.left_edges
.iter()
.filter(|(l, _)| *l == label)
.map(|(_, idx)| ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: *idx,
})
.collect()
}
fn depth(&self) -> usize {
let inner = self.inner.read();
inner.nodes[self.node_idx].depth
}
}
impl<V: DictionaryValue> SubstringDictionary for Scdawg<V> {
fn find_exact_substring(&self, pattern: &str) -> Vec<SubstringMatch<Self::Node>> {
let inner = self.inner.read();
let occurrences = inner.find_exact_substring(pattern);
occurrences
.into_iter()
.map(|(term, position)| {
let mut node_idx = 0;
for &byte in term.as_bytes().iter().take(position + pattern.len()) {
if let Some(next) = inner.nodes[node_idx].get_edge(byte) {
node_idx = next;
}
}
SubstringMatch::new(
ScdawgNodeHandle {
inner: Arc::clone(&self.inner),
node_idx,
},
term,
position,
pattern.len(),
)
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
use log::debug;
#[test]
fn test_scdawg_empty() {
let scdawg = Scdawg::<()>::new();
assert_eq!(scdawg.term_count(), 0);
assert!(!scdawg.contains("anything"));
}
#[test]
fn test_scdawg_insert_single() {
let scdawg = Scdawg::<()>::new();
assert!(scdawg.insert("hello"));
assert!(!scdawg.insert("hello")); assert_eq!(scdawg.term_count(), 1);
assert!(scdawg.contains("hello"));
}
#[test]
fn test_scdawg_substring_search() {
let scdawg = Scdawg::<()>::from_terms(vec!["cathedral", "category", "catering"]);
assert!(scdawg.contains_substring("cat"));
assert!(scdawg.contains_substring("the"));
assert!(scdawg.contains_substring("edral"));
assert!(scdawg.contains_substring("gory"));
assert!(!scdawg.contains_substring("xyz"));
}
#[test]
fn test_scdawg_find_exact_substring() {
let scdawg = Scdawg::<()>::from_terms(vec!["hello", "world"]);
let matches = scdawg.find_exact_substring("hello");
assert!(!matches.is_empty());
assert!(matches.iter().any(|m| m.term == "hello" && m.position == 0));
}
#[test]
fn test_scdawg_internal_substring() {
let scdawg = Scdawg::<()>::from_terms(vec!["cathedral"]);
assert!(scdawg.contains_substring("thedr"));
assert!(scdawg.contains_substring("hedr"));
assert!(scdawg.contains_substring("edra"));
}
#[test]
fn test_scdawg_multiple_terms() {
let scdawg = Scdawg::<()>::from_terms(vec!["abc", "bcd", "cde"]);
assert!(scdawg.contains("abc"));
assert!(scdawg.contains("bcd"));
assert!(scdawg.contains("cde"));
assert!(scdawg.contains_substring("bc")); assert!(scdawg.contains_substring("cd")); }
#[test]
fn test_scdawg_iter() {
let terms = vec!["apple", "banana", "cherry"];
let scdawg = Scdawg::<()>::from_terms(terms.clone());
let collected: Vec<_> = scdawg.iter().collect();
assert_eq!(collected.len(), 3);
for term in terms {
assert!(collected.contains(&term.to_string()));
}
}
#[test]
fn test_left_extension_edges() {
use crate::substring::BidirectionalDictionaryNode;
use crate::Dictionary;
let scdawg = Scdawg::<()>::from_terms(vec!["abc", "dbc"]);
let root = scdawg.root();
let node_b = root
.transition(b'b')
.expect("Should have edge 'b' from root");
let node_bc = node_b
.transition(b'c')
.expect("Should have edge 'c' from 'b'");
let left_edges: Vec<_> = node_bc.reverse_edges().collect();
let labels: std::collections::HashSet<_> = left_edges.iter().map(|(l, _)| *l).collect();
assert!(
labels.contains(&b'a'),
"Node 'bc' should have left extension edge with label 'a'. \
Found edges: {:?}",
left_edges
.iter()
.map(|(l, _)| *l as char)
.collect::<Vec<_>>()
);
assert!(
labels.contains(&b'd'),
"Node 'bc' should have left extension edge with label 'd'. \
Found edges: {:?}",
left_edges
.iter()
.map(|(l, _)| *l as char)
.collect::<Vec<_>>()
);
}
#[test]
fn debug_abab_structure() {
let scdawg = Scdawg::<()>::from_terms(vec!["abab"]);
let inner = scdawg.inner.read();
debug!("Node structure for 'abab':");
for (i, node) in inner.nodes.iter().enumerate() {
debug!(
"Node {}: length={}, term_ends={:?}, edges={:?}",
i,
node.length,
node.term_ends,
node.forward_edges
.iter()
.map(|(l, t)| (*l as char, *t))
.collect::<Vec<_>>()
);
}
let ab_node = inner.find_substring_fast("ab").unwrap();
debug!("Node for 'ab': {}", ab_node);
debug!("term_ends at 'ab': {:?}", inner.nodes[ab_node].term_ends);
debug!("children of 'ab': {:?}", inner.nodes[ab_node].forward_edges);
let mut results = Vec::new();
inner.collect_term_positions(ab_node, 2, &mut results);
debug!("Collected positions: {:?}", results);
}
#[test]
fn test_is_find() {
let scdawg = Scdawg::<()>::from_terms(vec!["cathedral", "category"]);
assert!(scdawg.find("cat").is_some());
assert!(scdawg.find("the").is_some());
assert!(scdawg.find("xyz").is_none());
}
#[test]
fn test_is_freq_single_term() {
let scdawg = Scdawg::<()>::from_terms(vec!["abab"]);
assert_eq!(
scdawg.freq("ab"),
2,
"Pattern 'ab' should appear twice in 'abab'"
);
assert_eq!(
scdawg.freq("a"),
2,
"Pattern 'a' should appear twice in 'abab'"
);
assert_eq!(
scdawg.freq("b"),
2,
"Pattern 'b' should appear twice in 'abab'"
);
assert_eq!(scdawg.freq("abab"), 1, "Pattern 'abab' should appear once");
assert_eq!(
scdawg.freq("xyz"),
0,
"Non-existent pattern should have freq 0"
);
}
#[test]
fn test_is_freq_multiple_terms() {
let scdawg = Scdawg::<()>::from_terms(vec!["abc", "bcd", "cde"]);
assert_eq!(scdawg.freq("bc"), 2, "Pattern 'bc' should appear twice");
assert_eq!(scdawg.freq("cd"), 2, "Pattern 'cd' should appear twice");
assert_eq!(scdawg.freq("c"), 3, "Pattern 'c' should appear three times");
}
#[test]
fn test_is_locations() {
let scdawg = Scdawg::<()>::from_terms(vec!["abab"]);
let locs = scdawg.locations("ab");
assert_eq!(locs.len(), 2, "Should find 2 occurrences of 'ab'");
let positions: std::collections::HashSet<_> = locs.iter().map(|(_, pos)| *pos).collect();
assert!(positions.contains(&0), "Should find 'ab' at position 0");
assert!(positions.contains(&2), "Should find 'ab' at position 2");
}
#[test]
fn test_is_locations_multiple_terms() {
let scdawg = Scdawg::<()>::from_terms(vec!["cat", "cathedral", "scatter"]);
let locs = scdawg.locations("cat");
debug!("Locations of 'cat': {:?}", locs);
let term_positions: std::collections::HashSet<_> = locs
.iter()
.map(|(term, pos)| (term.as_str(), *pos))
.collect();
assert!(
term_positions.contains(&("cat", 0)),
"Should find 'cat' at position 0 in 'cat'"
);
assert!(
term_positions.contains(&("cathedral", 0)),
"Should find 'cat' at position 0 in 'cathedral'"
);
assert!(
term_positions.contains(&("scatter", 1)),
"Should find 'cat' at position 1 in 'scatter'. Found: {:?}",
term_positions
);
}
#[test]
fn test_is_freq_at_and_locations_at() {
let scdawg = Scdawg::<()>::from_terms(vec!["abab", "bab"]);
let handle = scdawg.find("ab").expect("Should find 'ab'");
let freq = scdawg.freq_at(&handle);
assert!(freq >= 2, "Should have at least 2 occurrences of 'ab'");
let locs = scdawg.locations_at(&handle, 2);
assert!(!locs.is_empty(), "Should have locations for 'ab'");
}
#[test]
fn test_left_extension_multiple_terms() {
use crate::substring::BidirectionalDictionaryNode;
use crate::Dictionary;
let scdawg = Scdawg::<()>::from_terms(vec!["abc", "xbc"]);
let root = scdawg.root();
let node_b = root.transition(b'b').expect("Should have edge 'b'");
let node_bc = node_b.transition(b'c').expect("Should have edge 'c'");
let left_edges: Vec<_> = node_bc.reverse_edges().collect();
let labels: std::collections::HashSet<_> = left_edges.iter().map(|(l, _)| *l).collect();
assert!(
labels.contains(&b'a'),
"Node 'bc' should have left extension 'a' -> 'abc'"
);
assert!(
labels.contains(&b'x'),
"Node 'bc' should have left extension 'x' -> 'xbc'"
);
}
}