use crate::{Dictionary, DictionaryNode};
use std::io::{Read, Write};
mod bincode_impl;
mod json_impl;
mod plaintext_impl;
#[cfg(feature = "protobuf")]
pub mod protobuf_impl;
#[cfg(feature = "compression")]
mod compression_impl;
#[cfg(feature = "serialization")]
pub(crate) mod serde_helpers;
#[cfg(feature = "serialization")]
pub mod bincode_compat;
pub use self::bincode_impl::BincodeSerializer;
pub use self::json_impl::JsonSerializer;
pub use self::plaintext_impl::PlainTextSerializer;
#[cfg(feature = "protobuf")]
pub use self::protobuf_impl::{
DatProtobufSerializer, OptimizedProtobufSerializer, ProtobufSerializer,
SuffixAutomatonProtobufSerializer,
};
#[cfg(feature = "compression")]
pub use self::compression_impl::GzipSerializer;
/// A serialization format for dictionaries.
///
/// Both operations are associated functions without a receiver, so a format is
/// chosen purely by naming the implementing type, for example
/// `BincodeSerializer::serialize(&dict, &mut buffer)`.
pub trait DictionarySerializer {
    /// Writes `dict` to `writer`.
    ///
    /// Only dictionaries whose nodes are keyed by bytes (`Unit = u8`) are accepted.
    ///
    /// # Errors
    ///
    /// Returns a [`SerializationError`] when the implementation fails to
    /// encode or write the dictionary.
    fn serialize<D: Dictionary, W: Write>(dict: &D, writer: W) -> Result<(), SerializationError>
    where
        D::Node: DictionaryNode<Unit = u8>;

    /// Reads a dictionary of type `D` from `reader`.
    ///
    /// `D` has to implement [`DictionaryFromTerms`].
    ///
    /// # Errors
    ///
    /// Returns a [`SerializationError`] when the implementation fails to
    /// read or decode the input.
    fn deserialize<D: DictionaryFromTerms, R: Read>(reader: R) -> Result<D, SerializationError>;
}
/// Dictionaries that can be constructed from a sequence of owned terms.
///
/// This is the bound `DictionarySerializer::deserialize` places on its output type.
pub trait DictionaryFromTerms: Sized {
    /// Builds a dictionary from every term yielded by `terms`.
    fn from_terms<I>(terms: I) -> Self
    where
        I: IntoIterator<Item = String>;
}
/// Dictionaries that can be constructed from `(term, value)` pairs.
pub trait DictionaryFromTermsWithValues: Sized {
    /// The value type stored alongside each term.
    type Value: crate::DictionaryValue;

    /// Builds a dictionary from every `(term, value)` pair yielded by `entries`.
    fn from_terms_with_values<I: IntoIterator<Item = (String, Self::Value)>>(entries: I) -> Self;
}
/// Errors returned by the serializers in this module.
///
/// Every variant marked `#[from]` gets a `From` conversion from the wrapped
/// error type, so those errors can be propagated with `?`.
#[derive(Debug, thiserror::Error)]
pub enum SerializationError {
/// An error raised by the bincode compatibility layer.
#[error("Bincode error")]
Bincode(#[from] crate::serialization::bincode_compat::BincodeError),
/// An error raised by `serde_json`.
#[error("JSON error")]
Json(#[from] serde_json::Error),
/// A protobuf decoding error; only present when the `protobuf` feature is enabled.
#[cfg(feature = "protobuf")]
#[error("Protobuf error")]
Protobuf(#[from] prost::DecodeError),
/// An error from the underlying reader or writer.
#[error("I/O error")]
Io(#[from] std::io::Error),
/// A dictionary-level failure described by a free-form message, which is
/// included in the `Display` output.
#[error("Dictionary error: {0}")]
DictionaryError(String),
}
pub fn extract_terms<D>(dict: &D) -> Vec<String>
where
D: Dictionary,
D::Node: DictionaryNode<Unit = u8>,
{
let est_size = dict.len().unwrap_or(100);
let mut terms: Vec<String> = Vec::with_capacity(est_size);
struct Frame<N: DictionaryNode<Unit = u8>> {
children: Vec<(u8, N)>,
depth: usize,
}
let mut current_term: Vec<u8> = Vec::with_capacity(64);
let root = dict.root();
push_term_if_final(&root, ¤t_term, &mut terms);
let mut stack: Vec<Frame<D::Node>> = Vec::with_capacity(64);
let mut root_children: Vec<(u8, D::Node)> = root.edges().collect();
root_children.reverse();
stack.push(Frame {
children: root_children,
depth: 0,
});
while let Some(frame) = stack.last_mut() {
match frame.children.pop() {
Some((byte, child)) => {
let parent_depth = current_term.len();
current_term.push(byte);
push_term_if_final(&child, ¤t_term, &mut terms);
let mut child_children: Vec<(u8, D::Node)> = child.edges().collect();
child_children.reverse();
drop(child);
stack.push(Frame {
children: child_children,
depth: parent_depth,
});
}
None => {
current_term.truncate(frame.depth);
stack.pop();
}
}
}
terms
}
#[inline]
fn push_term_if_final<N: DictionaryNode<Unit = u8>>(
node: &N,
current_term: &[u8],
terms: &mut Vec<String>,
) {
if node.is_final() {
match std::str::from_utf8(current_term) {
Ok(s) => terms.push(s.to_string()),
Err(_) => terms.push(String::from_utf8_lossy(current_term).into_owned()),
}
}
}
pub fn extract_terms_char<D>(dict: &D) -> Vec<String>
where
D: Dictionary,
D::Node: DictionaryNode<Unit = char>,
{
let est_size = dict.len().unwrap_or(100);
let mut terms: Vec<String> = Vec::with_capacity(est_size);
struct Frame<N: DictionaryNode<Unit = char>> {
children: Vec<(char, N)>,
depth: usize,
}
let mut current_term: Vec<char> = Vec::with_capacity(64);
let root = dict.root();
push_char_term_if_final(&root, ¤t_term, &mut terms);
let mut stack: Vec<Frame<D::Node>> = Vec::with_capacity(64);
let mut root_children: Vec<(char, D::Node)> = root.edges().collect();
root_children.reverse();
stack.push(Frame {
children: root_children,
depth: 0,
});
while let Some(frame) = stack.last_mut() {
match frame.children.pop() {
Some((ch, child)) => {
let parent_depth = current_term.len();
current_term.push(ch);
push_char_term_if_final(&child, ¤t_term, &mut terms);
let mut child_children: Vec<(char, D::Node)> = child.edges().collect();
child_children.reverse();
drop(child);
stack.push(Frame {
children: child_children,
depth: parent_depth,
});
}
None => {
current_term.truncate(frame.depth);
stack.pop();
}
}
}
terms
}
#[inline]
fn push_char_term_if_final<N: DictionaryNode<Unit = char>>(
node: &N,
current_term: &[char],
terms: &mut Vec<String>,
) {
if node.is_final() {
terms.push(current_term.iter().collect());
}
}
/// Collects every `(term, value)` pair stored in a `char`-keyed mapped dictionary.
///
/// The terms come from [`extract_terms_char`], and each is then looked up with
/// `get_value`. A term whose lookup returns `None` is left out of the result.
pub fn extract_terms_with_values_char<D>(dict: &D) -> Vec<(String, D::Value)>
where
    D: crate::MappedDictionary,
    D::Node: DictionaryNode<Unit = char>,
{
    let all_terms = extract_terms_char(dict);
    let mut pairs = Vec::with_capacity(all_terms.len());
    pairs.extend(all_terms.into_iter().filter_map(|term| {
        let value = dict.get_value(&term)?;
        Some((term, value))
    }));
    pairs
}
/// Collects every `(term, value)` pair stored in a byte-keyed mapped dictionary.
///
/// The terms come from [`extract_terms`], and each is then looked up with
/// `get_value`. A term whose lookup returns `None` is left out of the result.
pub fn extract_terms_with_values<D>(dict: &D) -> Vec<(String, D::Value)>
where
    D: crate::MappedDictionary,
    D::Node: DictionaryNode<Unit = u8>,
{
    let all_terms = extract_terms(dict);
    let mut pairs = Vec::with_capacity(all_terms.len());
    pairs.extend(all_terms.into_iter().filter_map(|term| {
        let value = dict.get_value(&term)?;
        Some((term, value))
    }));
    pairs
}
/// Lets a `DoubleArrayTrie` be built from bare terms by forwarding to
/// `DoubleArrayTrie::from_terms`.
///
/// The impl names the type without generic arguments, so it covers only the
/// type's default parameters.
impl DictionaryFromTerms for crate::double_array_trie::DoubleArrayTrie {
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::double_array_trie::DoubleArrayTrie::from_terms(terms)
}
}
/// Lets a `DoubleArrayTrieChar` be built from bare terms by forwarding to
/// `DoubleArrayTrieChar::from_terms`.
///
/// The impl names the type without generic arguments, so it covers only the
/// type's default parameters.
impl DictionaryFromTerms for crate::double_array_trie_char::DoubleArrayTrieChar {
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::double_array_trie_char::DoubleArrayTrieChar::from_terms(terms)
}
}
/// Lets a `DynamicDawg` with any value type `V` be built from bare terms by
/// forwarding to `DynamicDawg::from_terms`.
impl<V: crate::DictionaryValue> DictionaryFromTerms for crate::dynamic_dawg::DynamicDawg<V> {
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::dynamic_dawg::DynamicDawg::from_terms(terms)
}
}
/// Lets a `DynamicDawgChar` with any value type `V` be built from bare terms
/// by forwarding to `DynamicDawgChar::from_terms`.
impl<V: crate::DictionaryValue> DictionaryFromTerms
for crate::dynamic_dawg_char::DynamicDawgChar<V>
{
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::dynamic_dawg_char::DynamicDawgChar::from_terms(terms)
}
}
/// Lets a `DynamicDawgU64` with any value type `V` be built from bare terms
/// by forwarding to `DynamicDawgU64::from_terms`.
impl<V: crate::DictionaryValue> DictionaryFromTerms for crate::dynamic_dawg_u64::DynamicDawgU64<V> {
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::dynamic_dawg_u64::DynamicDawgU64::from_terms(terms)
}
}
/// Lets a `SuffixAutomaton` with any value type `V` be built from bare terms.
///
/// Unlike the other implementors in this file, this forwards to `from_texts`,
/// so every incoming term is handed over as one source text.
impl<V: crate::DictionaryValue> DictionaryFromTerms
for crate::suffix_automaton::SuffixAutomaton<V>
{
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::suffix_automaton::SuffixAutomaton::from_texts(terms)
}
}
/// Lets a `SuffixAutomatonChar` with any value type `V` be built from bare terms.
///
/// Unlike most other implementors in this file, this forwards to `from_texts`,
/// so every incoming term is handed over as one source text.
impl<V: crate::DictionaryValue> DictionaryFromTerms
for crate::suffix_automaton_char::SuffixAutomatonChar<V>
{
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::suffix_automaton_char::SuffixAutomatonChar::from_texts(terms)
}
}
/// Lets an `Scdawg` with any value type `V` be built from bare terms by
/// forwarding to `Scdawg::from_terms`.
impl<V: crate::DictionaryValue> DictionaryFromTerms for crate::scdawg::Scdawg<V> {
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::scdawg::Scdawg::from_terms(terms)
}
}
/// Lets an `ScdawgChar` with any value type `V` be built from bare terms by
/// forwarding to `ScdawgChar::from_terms`.
impl<V: crate::DictionaryValue> DictionaryFromTerms for crate::scdawg_char::ScdawgChar<V> {
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::scdawg_char::ScdawgChar::from_terms(terms)
}
}
/// Lets a `PathMapDictionary` be built from bare terms by forwarding to
/// `PathMapDictionary::from_terms`.
///
/// Compiled only with the `pathmap-backend` feature, and the value type must
/// implement `Default` in addition to `DictionaryValue`.
#[cfg(feature = "pathmap-backend")]
impl<V: crate::DictionaryValue + Default> DictionaryFromTerms
for crate::pathmap::PathMapDictionary<V>
{
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::pathmap::PathMapDictionary::from_terms(terms)
}
}
/// Lets a `PathMapDictionaryChar` be built from bare terms by forwarding to
/// `PathMapDictionaryChar::from_terms`.
///
/// Compiled only with the `pathmap-backend` feature, and the value type must
/// implement `Default` in addition to `DictionaryValue`.
#[cfg(feature = "pathmap-backend")]
impl<V: crate::DictionaryValue + Default> DictionaryFromTerms
for crate::pathmap_char::PathMapDictionaryChar<V>
{
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms` on the type rather than back into this trait method — confirm.
fn from_terms<I: IntoIterator<Item = String>>(terms: I) -> Self {
crate::pathmap_char::PathMapDictionaryChar::from_terms(terms)
}
}
/// Lets a `DoubleArrayTrie` be built from `(term, value)` pairs by forwarding
/// to `DoubleArrayTrie::from_terms_with_values`.
impl<V: crate::DictionaryValue> DictionaryFromTermsWithValues
for crate::double_array_trie::DoubleArrayTrie<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::double_array_trie::DoubleArrayTrie::from_terms_with_values(entries)
}
}
/// Lets a `DoubleArrayTrieChar` be built from `(term, value)` pairs by
/// forwarding to `DoubleArrayTrieChar::from_terms_with_values`.
impl<V: crate::DictionaryValue> DictionaryFromTermsWithValues
for crate::double_array_trie_char::DoubleArrayTrieChar<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::double_array_trie_char::DoubleArrayTrieChar::from_terms_with_values(entries)
}
}
/// Lets a `DynamicDawg` be built from `(term, value)` pairs by forwarding to
/// `DynamicDawg::from_terms_with_values`.
impl<V: crate::DictionaryValue> DictionaryFromTermsWithValues
for crate::dynamic_dawg::DynamicDawg<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::dynamic_dawg::DynamicDawg::from_terms_with_values(entries)
}
}
/// Lets a `DynamicDawgChar` be built from `(term, value)` pairs by forwarding
/// to `DynamicDawgChar::from_terms_with_values`.
impl<V: crate::DictionaryValue> DictionaryFromTermsWithValues
for crate::dynamic_dawg_char::DynamicDawgChar<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::dynamic_dawg_char::DynamicDawgChar::from_terms_with_values(entries)
}
}
/// Lets a `DynamicDawgU64` be built from `(term, value)` pairs by forwarding
/// to `DynamicDawgU64::from_terms_with_values`.
impl<V: crate::DictionaryValue> DictionaryFromTermsWithValues
for crate::dynamic_dawg_u64::DynamicDawgU64<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::dynamic_dawg_u64::DynamicDawgU64::from_terms_with_values(entries)
}
}
/// Lets an `Scdawg` be built from `(term, value)` pairs by forwarding to
/// `Scdawg::from_terms_with_values`.
impl<V: crate::DictionaryValue> DictionaryFromTermsWithValues for crate::scdawg::Scdawg<V> {
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::scdawg::Scdawg::from_terms_with_values(entries)
}
}
/// Lets an `ScdawgChar` be built from `(term, value)` pairs by forwarding to
/// `ScdawgChar::from_terms_with_values`.
impl<V: crate::DictionaryValue> DictionaryFromTermsWithValues
for crate::scdawg_char::ScdawgChar<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::scdawg_char::ScdawgChar::from_terms_with_values(entries)
}
}
/// Lets a `PathMapDictionary` be built from `(term, value)` pairs by
/// forwarding to `PathMapDictionary::from_terms_with_values`.
///
/// Compiled only with the `pathmap-backend` feature, and the value type must
/// implement `Default` in addition to `DictionaryValue`.
#[cfg(feature = "pathmap-backend")]
impl<V: crate::DictionaryValue + Default> DictionaryFromTermsWithValues
for crate::pathmap::PathMapDictionary<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::pathmap::PathMapDictionary::from_terms_with_values(entries)
}
}
/// Lets a `PathMapDictionaryChar` be built from `(term, value)` pairs by
/// forwarding to `PathMapDictionaryChar::from_terms_with_values`.
///
/// Compiled only with the `pathmap-backend` feature, and the value type must
/// implement `Default` in addition to `DictionaryValue`.
#[cfg(feature = "pathmap-backend")]
impl<V: crate::DictionaryValue + Default> DictionaryFromTermsWithValues
for crate::pathmap_char::PathMapDictionaryChar<V>
{
// The associated value type is the dictionary's own value parameter.
type Value = V;
// NOTE(review): the path call below presumably resolves to an inherent
// `from_terms_with_values` on the type rather than back into this trait
// method — confirm.
fn from_terms_with_values<I>(entries: I) -> Self
where
I: IntoIterator<Item = (String, Self::Value)>,
{
crate::pathmap_char::PathMapDictionaryChar::from_terms_with_values(entries)
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::double_array_trie::DoubleArrayTrie;

    /// A bincode round-trip keeps every stored term and introduces none.
    #[test]
    fn test_bincode_roundtrip() {
        let dict = DoubleArrayTrie::from_terms(vec!["hello", "world", "test"]);
        let mut buffer = Vec::new();
        BincodeSerializer::serialize(&dict, &mut buffer).unwrap();
        let loaded: DoubleArrayTrie = BincodeSerializer::deserialize(&buffer[..]).unwrap();
        assert!(loaded.contains("hello"));
        assert!(loaded.contains("world"));
        assert!(loaded.contains("test"));
        assert!(!loaded.contains("missing"));
    }

    /// A JSON round-trip keeps every stored term and introduces none.
    #[test]
    fn test_json_roundtrip() {
        let dict = DoubleArrayTrie::from_terms(vec!["alpha", "beta", "gamma"]);
        let mut buffer = Vec::new();
        JsonSerializer::serialize(&dict, &mut buffer).unwrap();
        let loaded: DoubleArrayTrie = JsonSerializer::deserialize(&buffer[..]).unwrap();
        assert!(loaded.contains("alpha"));
        assert!(loaded.contains("beta"));
        assert!(loaded.contains("gamma"));
        assert!(!loaded.contains("delta"));
    }

    /// `extract_terms` returns exactly the inserted terms, including ones
    /// that share a prefix.
    #[test]
    fn test_extract_terms() {
        let dict = DoubleArrayTrie::from_terms(vec!["apple", "apply", "application"]);
        let terms = extract_terms(&dict);
        assert_eq!(terms.len(), 3);
        assert!(terms.contains(&"apple".to_string()));
        assert!(terms.contains(&"apply".to_string()));
        assert!(terms.contains(&"application".to_string()));
    }

    /// A single 1024-byte term is extracted intact from a `DoubleArrayTrie`.
    #[test]
    fn test_extract_terms_deep_chain_does_not_stack_overflow() {
        const DEPTH: usize = 1024;
        let long_term = "a".repeat(DEPTH);
        let dict = DoubleArrayTrie::from_terms(vec![long_term.clone()]);
        let terms = extract_terms(&dict);
        assert_eq!(terms.len(), 1, "expected exactly one term; got {:?}", terms);
        assert_eq!(terms[0].len(), DEPTH);
        assert_eq!(terms[0], long_term);
    }

    /// A single 50 000-byte term is extracted intact from a `DynamicDawg`.
    #[test]
    fn test_extract_terms_deep_chain_dynamic_dawg() {
        use crate::dynamic_dawg::DynamicDawg;
        const DEPTH: usize = 50_000;
        let long_term = "a".repeat(DEPTH);
        let dict: DynamicDawg<()> = DynamicDawg::from_terms(vec![long_term.clone()]);
        let terms = extract_terms(&dict);
        assert_eq!(
            terms.len(),
            1,
            "expected exactly one term; got {:?} entries",
            terms.len()
        );
        assert_eq!(terms[0].len(), DEPTH);
        assert_eq!(terms[0], long_term);
    }

    /// The bincode suffix-automaton round-trip keeps substring lookups and
    /// the original source texts.
    #[test]
    fn test_suffix_automaton_serialization() {
        use crate::suffix_automaton::SuffixAutomaton;
        let texts = vec!["hello world".to_string(), "test string".to_string()];
        // `texts` is not used again, so it is moved rather than cloned.
        let dict = SuffixAutomaton::from_texts(texts);
        let mut buffer = Vec::new();
        BincodeSerializer::serialize_suffix_automaton(&dict, &mut buffer).unwrap();
        let loaded = BincodeSerializer::deserialize_suffix_automaton(&buffer[..]).unwrap();
        assert!(loaded.contains("hello"));
        assert!(loaded.contains("world"));
        assert!(loaded.contains("test"));
        assert!(loaded.contains("string"));
        assert!(!loaded.contains("missing"));
        let sources = loaded.source_texts();
        assert_eq!(sources.len(), 2);
        assert!(sources.contains(&"hello world".to_string()));
        assert!(sources.contains(&"test string".to_string()));
    }

    /// The protobuf suffix-automaton round-trip keeps substring lookups and
    /// the original source texts.
    #[cfg(feature = "protobuf")]
    #[test]
    fn test_suffix_automaton_protobuf_serialization() {
        use crate::serialization::SuffixAutomatonProtobufSerializer;
        use crate::suffix_automaton::SuffixAutomaton;
        let texts = vec!["hello world".to_string(), "test string".to_string()];
        // `texts` is not used again, so it is moved rather than cloned.
        let dict = SuffixAutomaton::from_texts(texts);
        let mut buffer = Vec::new();
        SuffixAutomatonProtobufSerializer::serialize_suffix_automaton(&dict, &mut buffer).unwrap();
        let loaded =
            SuffixAutomatonProtobufSerializer::deserialize_suffix_automaton(&buffer[..]).unwrap();
        assert!(loaded.contains("hello"));
        assert!(loaded.contains("world"));
        assert!(loaded.contains("test"));
        assert!(loaded.contains("string"));
        assert!(!loaded.contains("missing"));
        let sources = loaded.source_texts();
        assert_eq!(sources.len(), 2);
        assert!(sources.contains(&"hello world".to_string()));
        assert!(sources.contains(&"test string".to_string()));
    }

    /// The DAT-specific protobuf round-trip keeps the stored terms and does
    /// not turn a shared prefix ("app") into a term.
    #[cfg(feature = "protobuf")]
    #[test]
    fn test_dat_protobuf_serialization() {
        use crate::serialization::DatProtobufSerializer;
        let dict = DoubleArrayTrie::from_terms(vec!["apple", "apply", "application"]);
        let mut buffer = Vec::new();
        DatProtobufSerializer::serialize_dat(&dict, &mut buffer).unwrap();
        let loaded = DatProtobufSerializer::deserialize_dat(&buffer[..]).unwrap();
        assert!(loaded.contains("apple"));
        assert!(loaded.contains("apply"));
        assert!(loaded.contains("application"));
        assert!(!loaded.contains("app"));
        assert!(!loaded.contains("banana"));
    }

    /// The generic protobuf round-trip keeps every stored term and introduces none.
    #[cfg(feature = "protobuf")]
    #[test]
    fn test_protobuf_roundtrip() {
        let dict = DoubleArrayTrie::from_terms(vec!["test", "testing", "tested"]);
        let mut buffer = Vec::new();
        ProtobufSerializer::serialize(&dict, &mut buffer).unwrap();
        let loaded: DoubleArrayTrie = ProtobufSerializer::deserialize(&buffer[..]).unwrap();
        assert!(loaded.contains("test"));
        assert!(loaded.contains("testing"));
        assert!(loaded.contains("tested"));
        assert!(!loaded.contains("tester"));
    }

    /// The optimized protobuf round-trip keeps every stored term and introduces none.
    #[cfg(feature = "protobuf")]
    #[test]
    fn test_optimized_protobuf_roundtrip() {
        let dict = DoubleArrayTrie::from_terms(vec!["alpha", "beta", "gamma"]);
        let mut buffer = Vec::new();
        OptimizedProtobufSerializer::serialize(&dict, &mut buffer).unwrap();
        let loaded: DoubleArrayTrie =
            OptimizedProtobufSerializer::deserialize(&buffer[..]).unwrap();
        assert!(loaded.contains("alpha"));
        assert!(loaded.contains("beta"));
        assert!(loaded.contains("gamma"));
        assert!(!loaded.contains("delta"));
    }

    /// Serializes one dictionary with all three protobuf serializers, checks
    /// that the optimized output is smaller than the plain one, and checks
    /// that each output deserializes to a dictionary holding the sampled terms.
    #[cfg(feature = "protobuf")]
    #[test]
    fn test_protobuf_format_comparison() {
        let dict = DoubleArrayTrie::from_terms(vec![
            "test",
            "testing",
            "tested",
            "tester",
            "tests",
            "apple",
            "apply",
            "application",
            "applicable",
        ]);
        let mut buf_v1 = Vec::new();
        ProtobufSerializer::serialize(&dict, &mut buf_v1).unwrap();
        let mut buf_v2 = Vec::new();
        OptimizedProtobufSerializer::serialize(&dict, &mut buf_v2).unwrap();
        let mut buf_dat = Vec::new();
        DatProtobufSerializer::serialize_dat(&dict, &mut buf_dat).unwrap();
        assert!(
            buf_v2.len() < buf_v1.len(),
            "V2 ({} bytes) should be smaller than V1 ({} bytes)",
            buf_v2.len(),
            buf_v1.len()
        );
        // Sizes are printed for manual inspection (`cargo test -- --nocapture`).
        println!("Protobuf V1 size: {} bytes", buf_v1.len());
        println!("Protobuf V2 size: {} bytes", buf_v2.len());
        println!("DAT format size: {} bytes", buf_dat.len());
        let loaded_v1: DoubleArrayTrie = ProtobufSerializer::deserialize(&buf_v1[..]).unwrap();
        let loaded_v2: DoubleArrayTrie =
            OptimizedProtobufSerializer::deserialize(&buf_v2[..]).unwrap();
        let loaded_dat = DatProtobufSerializer::deserialize_dat(&buf_dat[..]).unwrap();
        for term in ["test", "testing", "apple", "application"] {
            assert!(loaded_v1.contains(term));
            assert!(loaded_v2.contains(term));
            assert!(loaded_dat.contains(term));
        }
    }
}