use std::collections::HashMap;
use crate::{StarError, StarResult, StarTerm, StarTriple};
pub type TermId = u64;
pub type QuotedId = u64;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum EncodedTerm {
Iri(TermId),
BlankNode(TermId),
Literal(TermId),
QuotedTriple(QuotedId),
}
impl EncodedTerm {
pub fn is_quoted(&self) -> bool {
matches!(self, Self::QuotedTriple(_))
}
pub fn raw_id(&self) -> u64 {
match self {
Self::Iri(id) | Self::BlankNode(id) | Self::Literal(id) => *id,
Self::QuotedTriple(id) => *id,
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum TermKey {
Iri(String),
BlankNode(String),
Literal(String),
}
impl TermKey {
fn from_term(term: &StarTerm) -> Option<Self> {
match term {
StarTerm::NamedNode(n) => Some(Self::Iri(n.iri.clone())),
StarTerm::BlankNode(b) => Some(Self::BlankNode(b.id.clone())),
StarTerm::Literal(l) => {
let mut repr = l.value.clone();
if let Some(ref dt) = l.datatype {
repr.push_str("^^");
repr.push_str(&dt.iri);
}
if let Some(ref lang) = l.language {
repr.push('@');
repr.push_str(lang);
}
Some(Self::Literal(repr))
}
StarTerm::Variable(_) | StarTerm::QuotedTriple(_) => None,
}
}
}
pub struct StarDictionary {
str_to_id: HashMap<TermKey, TermId>,
id_to_str: Vec<(TermKey, String)>,
next_term_id: TermId,
quoted_to_id: HashMap<(EncodedTerm, EncodedTerm, EncodedTerm), QuotedId>,
id_to_quoted: Vec<(EncodedTerm, EncodedTerm, EncodedTerm)>,
next_quoted_id: QuotedId,
}
impl StarDictionary {
pub fn new() -> Self {
Self {
str_to_id: HashMap::new(),
id_to_str: vec![(TermKey::Iri(String::new()), String::new())],
next_term_id: 1,
quoted_to_id: HashMap::new(),
id_to_quoted: vec![(
EncodedTerm::Iri(0),
EncodedTerm::Iri(0),
EncodedTerm::Iri(0),
)],
next_quoted_id: 1,
}
}
pub fn encode_term(&mut self, term: &StarTerm) -> StarResult<EncodedTerm> {
match term {
StarTerm::Variable(v) => Err(StarError::invalid_term_type(format!(
"Variables cannot be encoded in the dictionary: ?{}",
v.name
))),
StarTerm::QuotedTriple(inner) => {
let s = self.encode_term(&inner.subject)?;
let p = self.encode_term(&inner.predicate)?;
let o = self.encode_term(&inner.object)?;
Ok(self.intern_quoted(s, p, o))
}
StarTerm::NamedNode(n) => {
let key = TermKey::Iri(n.iri.clone());
Ok(EncodedTerm::Iri(self.intern_str(key, &n.iri)))
}
StarTerm::BlankNode(b) => {
let key = TermKey::BlankNode(b.id.clone());
Ok(EncodedTerm::BlankNode(self.intern_str(key, &b.id)))
}
StarTerm::Literal(_l) => {
let tk = TermKey::from_term(term).expect("literal always has a term key");
let repr = match tk.clone() {
TermKey::Literal(s) => s,
_ => unreachable!(),
};
Ok(EncodedTerm::Literal(self.intern_str(tk, &repr)))
}
}
}
pub fn encode_triple(
&mut self,
triple: &StarTriple,
) -> StarResult<(EncodedTerm, EncodedTerm, EncodedTerm)> {
let s = self.encode_term(&triple.subject)?;
let p = self.encode_term(&triple.predicate)?;
let o = self.encode_term(&triple.object)?;
Ok((s, p, o))
}
pub fn decode_term(&self, encoded: &EncodedTerm) -> Option<StarTerm> {
match encoded {
EncodedTerm::Iri(id) => {
let (key, _) = self.id_to_str.get(*id as usize)?;
if let TermKey::Iri(iri) = key {
StarTerm::iri(iri).ok()
} else {
None
}
}
EncodedTerm::BlankNode(id) => {
let (key, _) = self.id_to_str.get(*id as usize)?;
if let TermKey::BlankNode(bn) = key {
StarTerm::blank_node(bn).ok()
} else {
None
}
}
EncodedTerm::Literal(id) => {
let (key, repr) = self.id_to_str.get(*id as usize)?;
if let TermKey::Literal(_) = key {
Self::decode_literal_repr(repr)
} else {
None
}
}
EncodedTerm::QuotedTriple(id) => {
let (s_enc, p_enc, o_enc) = self.id_to_quoted.get(*id as usize)?;
let s = self.decode_term(s_enc)?;
let p = self.decode_term(p_enc)?;
let o = self.decode_term(o_enc)?;
Some(StarTerm::quoted_triple(StarTriple::new(s, p, o)))
}
}
}
pub fn term_count(&self) -> usize {
self.id_to_str.len().saturating_sub(1)
}
pub fn quoted_count(&self) -> usize {
self.id_to_quoted.len().saturating_sub(1)
}
pub fn memory_bytes(&self) -> usize {
let str_bytes: usize = self
.id_to_str
.iter()
.map(|(_, s)| s.len() + std::mem::size_of::<String>())
.sum();
let str_map_bytes = self.str_to_id.len() * 64;
let quoted_bytes = self.id_to_quoted.len() * 3 * std::mem::size_of::<EncodedTerm>();
let quoted_map_bytes = self.quoted_to_id.len() * 64;
str_bytes + str_map_bytes + quoted_bytes + quoted_map_bytes
}
pub fn next_term_id(&self) -> TermId {
self.next_term_id
}
pub fn next_quoted_id(&self) -> QuotedId {
self.next_quoted_id
}
fn intern_str(&mut self, key: TermKey, repr: &str) -> TermId {
if let Some(&existing) = self.str_to_id.get(&key) {
return existing;
}
let id = self.next_term_id;
self.str_to_id.insert(key.clone(), id);
self.id_to_str.push((key, repr.to_owned()));
self.next_term_id += 1;
id
}
fn intern_quoted(&mut self, s: EncodedTerm, p: EncodedTerm, o: EncodedTerm) -> EncodedTerm {
let key = (s.clone(), p.clone(), o.clone());
if let Some(&existing) = self.quoted_to_id.get(&key) {
return EncodedTerm::QuotedTriple(existing);
}
let id = self.next_quoted_id;
self.quoted_to_id.insert(key, id);
self.id_to_quoted.push((s, p, o));
self.next_quoted_id += 1;
EncodedTerm::QuotedTriple(id)
}
fn decode_literal_repr(repr: &str) -> Option<StarTerm> {
if let Some(dt_pos) = repr.find("^^") {
let value = &repr[..dt_pos];
let datatype = &repr[dt_pos + 2..];
StarTerm::literal_with_datatype(value, datatype).ok()
} else if let Some(lang_pos) = repr.rfind('@') {
let value = &repr[..lang_pos];
let lang = &repr[lang_pos + 1..];
StarTerm::literal_with_language(value, lang).ok()
} else {
StarTerm::literal(repr).ok()
}
}
}
impl Default for StarDictionary {
fn default() -> Self {
Self::new()
}
}
pub struct CompressedTripleSet {
dict: StarDictionary,
triples: Vec<(EncodedTerm, EncodedTerm, EncodedTerm)>,
}
impl CompressedTripleSet {
pub fn new() -> Self {
Self {
dict: StarDictionary::new(),
triples: Vec::new(),
}
}
pub fn insert(&mut self, triple: &StarTriple) -> StarResult<()> {
let encoded = self.dict.encode_triple(triple)?;
self.triples.push(encoded);
Ok(())
}
pub fn iter_triples(&self) -> impl Iterator<Item = Option<StarTriple>> + '_ {
self.triples.iter().map(move |(s, p, o)| {
let s_term = self.dict.decode_term(s)?;
let p_term = self.dict.decode_term(p)?;
let o_term = self.dict.decode_term(o)?;
Some(StarTriple::new(s_term, p_term, o_term))
})
}
pub fn len(&self) -> usize {
self.triples.len()
}
pub fn is_empty(&self) -> bool {
self.triples.is_empty()
}
pub fn dictionary(&self) -> &StarDictionary {
&self.dict
}
pub fn memory_bytes(&self) -> usize {
self.dict.memory_bytes() + self.triples.len() * 3 * std::mem::size_of::<EncodedTerm>()
}
}
impl Default for CompressedTripleSet {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::model::{BlankNode, Literal, NamedNode};
fn iri(s: &str) -> StarTerm {
StarTerm::NamedNode(NamedNode { iri: s.to_owned() })
}
fn blank(id: &str) -> StarTerm {
StarTerm::BlankNode(BlankNode { id: id.to_owned() })
}
fn lit(v: &str) -> StarTerm {
StarTerm::Literal(Literal {
value: v.to_owned(),
language: None,
datatype: None,
})
}
fn lit_lang(v: &str, lang: &str) -> StarTerm {
StarTerm::Literal(Literal {
value: v.to_owned(),
language: Some(lang.to_owned()),
datatype: None,
})
}
fn lit_dt(v: &str, dt: &str) -> StarTerm {
StarTerm::Literal(Literal {
value: v.to_owned(),
language: None,
datatype: Some(NamedNode { iri: dt.to_owned() }),
})
}
#[test]
fn test_iri_round_trip() {
let mut dict = StarDictionary::new();
let term = iri("http://example.org/subject");
let encoded = dict.encode_term(&term).expect("encode ok");
assert!(matches!(encoded, EncodedTerm::Iri(_)));
let decoded = dict.decode_term(&encoded).expect("decode ok");
assert_eq!(decoded, term);
}
#[test]
fn test_blank_node_round_trip() {
let mut dict = StarDictionary::new();
let term = blank("b1");
let encoded = dict.encode_term(&term).expect("encode ok");
assert!(matches!(encoded, EncodedTerm::BlankNode(_)));
let decoded = dict.decode_term(&encoded).expect("decode ok");
assert_eq!(decoded, term);
}
#[test]
fn test_literal_round_trip() {
let mut dict = StarDictionary::new();
let term = lit("hello world");
let encoded = dict.encode_term(&term).expect("encode ok");
assert!(matches!(encoded, EncodedTerm::Literal(_)));
let decoded = dict.decode_term(&encoded).expect("decode ok");
assert_eq!(decoded, term);
}
#[test]
fn test_literal_with_language_round_trip() {
let mut dict = StarDictionary::new();
let term = lit_lang("Bonjour", "fr");
let encoded = dict.encode_term(&term).expect("encode ok");
let decoded = dict.decode_term(&encoded).expect("decode ok");
assert_eq!(decoded, term);
}
#[test]
fn test_literal_with_datatype_round_trip() {
let mut dict = StarDictionary::new();
let term = lit_dt("42", "http://www.w3.org/2001/XMLSchema#integer");
let encoded = dict.encode_term(&term).expect("encode ok");
let decoded = dict.decode_term(&encoded).expect("decode ok");
assert_eq!(decoded, term);
}
#[test]
fn test_deduplication() {
let mut dict = StarDictionary::new();
let alice = iri("http://example.org/alice");
let id1 = dict.encode_term(&alice).expect("encode ok");
let id2 = dict.encode_term(&alice).expect("encode ok");
assert_eq!(id1, id2);
assert_eq!(dict.term_count(), 1);
}
#[test]
fn test_quoted_triple_round_trip() {
let mut dict = StarDictionary::new();
let inner = StarTriple::new(
iri("http://example.org/alice"),
iri("http://example.org/age"),
lit("30"),
);
let quoted = StarTerm::quoted_triple(inner.clone());
let encoded = dict.encode_term("ed).expect("encode ok");
assert!(encoded.is_quoted());
let decoded = dict.decode_term(&encoded).expect("decode ok");
assert_eq!(decoded, quoted);
}
#[test]
fn test_nested_quoted_triple_round_trip() {
let mut dict = StarDictionary::new();
let inner = StarTriple::new(
iri("http://ex.org/s"),
iri("http://ex.org/p"),
iri("http://ex.org/o"),
);
let outer = StarTriple::new(
StarTerm::quoted_triple(inner),
iri("http://ex.org/meta"),
lit("evidence"),
);
let doubly_quoted = StarTerm::quoted_triple(outer);
let encoded = dict.encode_term(&doubly_quoted).expect("encode ok");
let decoded = dict.decode_term(&encoded).expect("decode ok");
assert_eq!(decoded, doubly_quoted);
}
#[test]
fn test_quoted_deduplication() {
let mut dict = StarDictionary::new();
let triple = StarTriple::new(
iri("http://ex.org/s"),
iri("http://ex.org/p"),
iri("http://ex.org/o"),
);
let qt = StarTerm::quoted_triple(triple);
let id1 = dict.encode_term(&qt).expect("encode ok");
let id2 = dict.encode_term(&qt).expect("encode ok");
assert_eq!(id1, id2);
assert_eq!(dict.quoted_count(), 1);
}
#[test]
fn test_variable_encoding_fails() {
let mut dict = StarDictionary::new();
let var = StarTerm::variable("x").expect("valid variable");
assert!(dict.encode_term(&var).is_err());
}
#[test]
fn test_term_count_is_accurate() {
let mut dict = StarDictionary::new();
dict.encode_term(&iri("http://ex.org/a")).expect("ok");
dict.encode_term(&iri("http://ex.org/b")).expect("ok");
dict.encode_term(&blank("b1")).expect("ok");
dict.encode_term(&lit("hello")).expect("ok");
dict.encode_term(&iri("http://ex.org/a")).expect("ok");
assert_eq!(dict.term_count(), 4);
}
#[test]
fn test_compressed_triple_set() {
let mut set = CompressedTripleSet::new();
for i in 0..100_usize {
let triple = StarTriple::new(
iri(&format!("http://ex.org/s{i}")),
iri("http://ex.org/p"),
lit(&format!("{i}")),
);
set.insert(&triple).expect("insert ok");
}
assert_eq!(set.len(), 100);
assert_eq!(set.dictionary().term_count(), 201);
let decoded: Vec<StarTriple> = set
.iter_triples()
.collect::<Vec<_>>()
.into_iter()
.flatten()
.collect();
assert_eq!(decoded.len(), 100);
}
#[test]
fn test_memory_bytes_grows_with_insertions() {
let mut dict = StarDictionary::new();
let before = dict.memory_bytes();
for i in 0..50_usize {
dict.encode_term(&iri(&format!("http://ex.org/r{i}")))
.expect("ok");
}
let after = dict.memory_bytes();
assert!(after > before, "memory should grow after insertions");
}
#[test]
fn test_encode_triple() {
let mut dict = StarDictionary::new();
let triple = StarTriple::new(
iri("http://ex.org/s"),
iri("http://ex.org/p"),
iri("http://ex.org/o"),
);
let (s, p, o) = dict.encode_triple(&triple).expect("encode ok");
assert!(matches!(s, EncodedTerm::Iri(_)));
assert!(matches!(p, EncodedTerm::Iri(_)));
assert!(matches!(o, EncodedTerm::Iri(_)));
}
}