use std::sync::Arc;
use crate::substring::{BidirectionalDictionaryNode, SubstringDictionary, SubstringMatch};
use crate::sync_compat::RwLock;
use crate::value::DictionaryValue;
use crate::{Dictionary, DictionaryNode};
const NIL: usize = usize::MAX;
type ScdawgCharNode<V = ()> = crate::scdawg_core::ScdawgNode<char, V>;
type ScdawgCharInner<V = ()> = crate::scdawg_core::ScdawgCoreInner<char, V>;
#[derive(Clone, Debug)]
pub struct ScdawgChar<V: DictionaryValue = ()> {
inner: Arc<RwLock<ScdawgCharInner<V>>>,
}
impl<V: DictionaryValue> Default for ScdawgChar<V> {
fn default() -> Self {
Self::new()
}
}
impl<V: DictionaryValue> ScdawgChar<V> {
pub fn new() -> Self {
Self {
inner: Arc::new(RwLock::new(ScdawgCharInner::new())),
}
}
pub fn from_terms<I, S>(terms: I) -> Self
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let terms_vec: Vec<S> = terms.into_iter().collect();
let term_count = terms_vec.len();
let total_chars: usize = terms_vec.iter().map(|s| s.as_ref().chars().count()).sum();
let inner = ScdawgCharInner::with_capacity(term_count, total_chars);
let scdawg = Self {
inner: Arc::new(RwLock::new(inner)),
};
{
let mut inner = scdawg.inner.write();
for term in terms_vec {
inner.insert(term.as_ref());
}
inner.compute_left_edges();
}
scdawg
}
pub fn from_terms_with_values<I, S>(terms: I) -> Self
where
I: IntoIterator<Item = (S, V)>,
S: AsRef<str>,
{
let scdawg = ScdawgChar::new();
for (term, value) in terms {
scdawg.insert_with_value(term.as_ref(), value);
}
scdawg
}
pub fn insert(&self, term: &str) -> bool {
let mut inner = self.inner.write();
let result = inner.insert(term);
if result {
inner.compute_left_edges();
}
result
}
pub fn insert_with_value(&self, term: &str, value: V) -> bool {
let mut inner = self.inner.write();
let result = inner.insert_with_value(term, value);
if result {
inner.compute_left_edges();
}
result
}
pub fn contains_substring(&self, pattern: &str) -> bool {
let inner = self.inner.read();
inner.contains_substring(pattern)
}
pub fn iter(&self) -> impl Iterator<Item = String> {
let inner = self.inner.read();
inner.terms.clone().into_iter()
}
pub fn term_count(&self) -> usize {
self.inner.read().term_count()
}
pub fn node_count(&self) -> usize {
self.inner.read().nodes.len()
}
pub fn get_value(&self, term: &str) -> Option<V>
where
V: Clone,
{
let inner = self.inner.read();
if let Some(value) = inner.term_values.get(term) {
return Some(value.clone());
}
let mut current = 0;
for ch in term.chars() {
match inner.nodes[current].get_edge(ch) {
Some(next) => current = next,
None => return None,
}
}
if inner.nodes[current].is_final {
inner.nodes[current].value.clone()
} else {
None
}
}
pub fn find(&self, pattern: &str) -> Option<ScdawgCharNodeHandle<V>> {
let inner = self.inner.read();
inner
.find_substring_fast(pattern)
.map(|node_idx| ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx,
})
}
pub fn freq(&self, pattern: &str) -> usize {
let inner = self.inner.read();
inner.frequency(pattern)
}
pub fn freq_at(&self, handle: &ScdawgCharNodeHandle<V>) -> usize {
let inner = self.inner.read();
let mut count = 0;
inner.count_occurrences(handle.node_idx, &mut count);
count
}
pub fn locations(&self, pattern: &str) -> Vec<(String, usize)> {
let inner = self.inner.read();
inner.find_exact_substring(pattern)
}
pub fn locations_at(
&self,
handle: &ScdawgCharNodeHandle<V>,
pattern_len: usize,
) -> Vec<(String, usize)> {
let inner = self.inner.read();
let mut results = Vec::new();
inner.collect_term_positions(handle.node_idx, pattern_len, &mut results);
results
}
}
impl<V: DictionaryValue> Dictionary for ScdawgChar<V> {
type Node = ScdawgCharNodeHandle<V>;
fn len(&self) -> Option<usize> {
Some(self.inner.read().term_count())
}
fn contains(&self, term: &str) -> bool {
self.inner.read().contains(term)
}
fn root(&self) -> Self::Node {
ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: 0,
}
}
fn sync_strategy(&self) -> crate::SyncStrategy {
crate::SyncStrategy::ExternalSync
}
}
impl<V: DictionaryValue> crate::MappedDictionary for ScdawgChar<V> {
type Value = V;
fn get_value(&self, term: &str) -> Option<Self::Value> {
Self::get_value(self, term)
}
}
#[derive(Clone)]
pub struct ScdawgCharNodeHandle<V: DictionaryValue = ()> {
inner: Arc<RwLock<ScdawgCharInner<V>>>,
node_idx: usize,
}
impl<V: DictionaryValue> std::fmt::Debug for ScdawgCharNodeHandle<V> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("ScdawgCharNodeHandle")
.field("node_idx", &self.node_idx)
.finish()
}
}
impl<V: DictionaryValue> DictionaryNode for ScdawgCharNodeHandle<V> {
type Unit = char;
fn is_final(&self) -> bool {
let inner = self.inner.read();
inner.nodes[self.node_idx].is_final
}
fn transition(&self, label: char) -> Option<Self> {
let inner = self.inner.read();
inner.nodes[self.node_idx]
.get_edge(label)
.map(|idx| ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: idx,
})
}
fn edges(&self) -> Box<dyn Iterator<Item = (char, Self)> + '_> {
let inner = self.inner.read();
let edges: Vec<_> = inner.nodes[self.node_idx]
.forward_edges
.iter()
.map(|&(label, idx)| {
(
label,
ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: idx,
},
)
})
.collect();
Box::new(edges.into_iter())
}
fn edge_count(&self) -> Option<usize> {
let inner = self.inner.read();
Some(inner.nodes[self.node_idx].forward_edges.len())
}
}
unsafe impl<V: DictionaryValue> Send for ScdawgCharNodeHandle<V> {}
unsafe impl<V: DictionaryValue> Sync for ScdawgCharNodeHandle<V> {}
impl<V: DictionaryValue> BidirectionalDictionaryNode for ScdawgCharNodeHandle<V> {
fn parent(&self) -> Option<Self> {
let inner = self.inner.read();
let node = &inner.nodes[self.node_idx];
if node.parent == NIL {
None
} else {
Some(ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: node.parent,
})
}
}
fn parent_label(&self) -> Option<char> {
let inner = self.inner.read();
let node = &inner.nodes[self.node_idx];
if node.parent == NIL {
None
} else {
Some(node.parent_label)
}
}
fn reverse_edges(&self) -> Box<dyn Iterator<Item = (char, Self)> + '_> {
let inner = self.inner.read();
let edges: Vec<_> = inner.nodes[self.node_idx]
.left_edges
.iter()
.map(|&(label, idx)| {
(
label,
ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: idx,
},
)
})
.collect();
Box::new(edges.into_iter())
}
fn reverse_transition(&self, label: char) -> Vec<Self> {
let inner = self.inner.read();
inner.nodes[self.node_idx]
.left_edges
.iter()
.filter(|(l, _)| *l == label)
.map(|(_, idx)| ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx: *idx,
})
.collect()
}
fn depth(&self) -> usize {
let inner = self.inner.read();
inner.nodes[self.node_idx].depth
}
}
impl<V: DictionaryValue> SubstringDictionary for ScdawgChar<V> {
fn find_exact_substring(&self, pattern: &str) -> Vec<SubstringMatch<Self::Node>> {
let inner = self.inner.read();
let occurrences = inner.find_exact_substring(pattern);
let pattern_len = pattern.chars().count();
occurrences
.into_iter()
.map(|(term, position)| {
let mut node_idx = 0;
for ch in term.chars().take(position + pattern_len) {
if let Some(next) = inner.nodes[node_idx].get_edge(ch) {
node_idx = next;
}
}
SubstringMatch::new(
ScdawgCharNodeHandle {
inner: Arc::clone(&self.inner),
node_idx,
},
term,
position,
pattern_len,
)
})
.collect()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_scdawg_char_empty() {
let scdawg = ScdawgChar::<()>::new();
assert_eq!(scdawg.term_count(), 0);
assert!(!scdawg.contains("anything"));
}
#[test]
fn test_scdawg_char_insert_single() {
let scdawg = ScdawgChar::<()>::new();
assert!(scdawg.insert("hello"));
assert!(!scdawg.insert("hello")); assert_eq!(scdawg.term_count(), 1);
assert!(scdawg.contains("hello"));
}
#[test]
fn test_scdawg_char_unicode() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["café", "naïve", "中文"]);
assert_eq!(scdawg.term_count(), 3);
assert!(scdawg.contains("café"));
assert!(scdawg.contains("naïve"));
assert!(scdawg.contains("中文"));
assert!(!scdawg.contains("cafe")); }
#[test]
fn test_scdawg_char_substring_search() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["café"]);
assert!(scdawg.contains_substring("afé"));
assert!(scdawg.contains_substring("ca"));
assert!(scdawg.contains_substring("fé"));
assert!(!scdawg.contains_substring("xyz"));
}
#[test]
fn test_scdawg_char_find_exact_substring() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["café"]);
let matches = scdawg.find_exact_substring("afé");
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].term, "café");
assert_eq!(matches[0].position, 1); assert_eq!(matches[0].length, 3); }
#[test]
fn test_scdawg_char_cjk() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["中文字"]);
assert!(scdawg.contains_substring("中"));
assert!(scdawg.contains_substring("中文"));
assert!(scdawg.contains_substring("文字"));
assert!(scdawg.contains_substring("中文字"));
let matches = scdawg.find_exact_substring("文");
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].position, 1); }
#[test]
fn test_scdawg_char_bidirectional() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["中文"]);
let root = scdawg.root();
let zhong = root.transition('中').unwrap();
let wen = zhong.transition('文').unwrap();
assert!(wen.is_final());
assert_eq!(wen.depth(), 2);
let back = wen.parent().unwrap();
assert_eq!(wen.parent_label(), Some('文'));
assert_eq!(back.depth(), 1);
let back_root = back.parent().unwrap();
assert!(back_root.parent().is_none());
}
#[test]
fn test_scdawg_char_path_string() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["café"]);
let root = scdawg.root();
let c = root.transition('c').unwrap();
let a = c.transition('a').unwrap();
let f = a.transition('f').unwrap();
let e = f.transition('é').unwrap();
assert_eq!(e.path_string(), "café");
assert_eq!(a.path_string(), "ca");
}
#[test]
fn test_scdawg_char_with_values() {
let scdawg = ScdawgChar::<u32>::new();
scdawg.insert_with_value("日本語", 42);
assert_eq!(scdawg.get_value("日本語"), Some(42));
assert_eq!(scdawg.get_value("日本"), None);
}
#[test]
fn test_scdawg_char_emoji() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["hello🎉world"]);
assert!(scdawg.contains("hello🎉world"));
assert_eq!(scdawg.term_count(), 1);
let matches = scdawg.find_exact_substring("🎉");
assert_eq!(matches.len(), 1);
assert_eq!(matches[0].position, 5); }
#[test]
fn test_scdawg_char_multiple_terms() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["abc", "bcd", "cde"]);
assert!(scdawg.contains("abc"));
assert!(scdawg.contains("bcd"));
assert!(scdawg.contains("cde"));
assert!(scdawg.contains_substring("bc")); assert!(scdawg.contains_substring("cd")); }
#[test]
fn test_scdawg_char_is_freq() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["abab"]);
assert_eq!(scdawg.freq("ab"), 2);
assert_eq!(scdawg.freq("a"), 2);
assert_eq!(scdawg.freq("xyz"), 0);
}
#[test]
fn test_scdawg_char_is_locations() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["abab"]);
let locs = scdawg.locations("ab");
assert_eq!(locs.len(), 2);
let positions: std::collections::HashSet<_> = locs.iter().map(|(_, pos)| *pos).collect();
assert!(positions.contains(&0));
assert!(positions.contains(&2));
}
#[test]
fn test_scdawg_char_left_extension_edges() {
let scdawg = ScdawgChar::<()>::from_terms(vec!["abc", "dbc"]);
let root = scdawg.root();
let node_b = root.transition('b').expect("Should have edge 'b'");
let node_bc = node_b.transition('c').expect("Should have edge 'c'");
let left_edges: Vec<_> = node_bc.reverse_edges().collect();
let labels: std::collections::HashSet<_> = left_edges.iter().map(|(l, _)| *l).collect();
assert!(labels.contains(&'a'), "Should have left extension 'a'");
assert!(labels.contains(&'d'), "Should have left extension 'd'");
}
}