use std::collections::HashMap;
pub type NodeId = u32;
const NULL_ID: NodeId = u32::MAX;
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum RdfTerm {
Iri(String),
BlankNode(String),
PlainLiteral(String),
LangLiteral { value: String, lang: String },
TypedLiteral { value: String, datatype: String },
}
impl RdfTerm {
pub fn canonical_key(&self) -> String {
match self {
RdfTerm::Iri(iri) => format!("I:{}", iri),
RdfTerm::BlankNode(id) => format!("B:{}", id),
RdfTerm::PlainLiteral(v) => format!("L:{}", v),
RdfTerm::LangLiteral { value, lang } => format!("LL:{}@{}", value, lang),
RdfTerm::TypedLiteral { value, datatype } => {
format!("LT:{}^^{}", value, datatype)
}
}
}
pub fn iri(s: impl Into<String>) -> Self {
RdfTerm::Iri(s.into())
}
pub fn blank(id: impl Into<String>) -> Self {
RdfTerm::BlankNode(id.into())
}
pub fn literal(v: impl Into<String>) -> Self {
RdfTerm::PlainLiteral(v.into())
}
pub fn lang_literal(v: impl Into<String>, lang: impl Into<String>) -> Self {
RdfTerm::LangLiteral {
value: v.into(),
lang: lang.into(),
}
}
pub fn typed_literal(v: impl Into<String>, datatype: impl Into<String>) -> Self {
RdfTerm::TypedLiteral {
value: v.into(),
datatype: datatype.into(),
}
}
pub fn value(&self) -> &str {
match self {
RdfTerm::Iri(s) => s,
RdfTerm::BlankNode(s) => s,
RdfTerm::PlainLiteral(s) => s,
RdfTerm::LangLiteral { value, .. } => value,
RdfTerm::TypedLiteral { value, .. } => value,
}
}
pub fn datatype(&self) -> Option<&str> {
match self {
RdfTerm::TypedLiteral { datatype, .. } => Some(datatype),
_ => None,
}
}
pub fn lang(&self) -> Option<&str> {
match self {
RdfTerm::LangLiteral { lang, .. } => Some(lang),
_ => None,
}
}
pub fn is_iri(&self) -> bool {
matches!(self, RdfTerm::Iri(_))
}
pub fn is_blank_node(&self) -> bool {
matches!(self, RdfTerm::BlankNode(_))
}
pub fn is_literal(&self) -> bool {
matches!(
self,
RdfTerm::PlainLiteral(_) | RdfTerm::LangLiteral { .. } | RdfTerm::TypedLiteral { .. }
)
}
}
impl std::fmt::Display for RdfTerm {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
RdfTerm::Iri(iri) => write!(f, "<{}>", iri),
RdfTerm::BlankNode(id) => write!(f, "_:{}", id),
RdfTerm::PlainLiteral(v) => write!(f, "\"{}\"", v),
RdfTerm::LangLiteral { value, lang } => write!(f, "\"{}\"@{}", value, lang),
RdfTerm::TypedLiteral { value, datatype } => {
write!(f, "\"{}\"^^<{}>", value, datatype)
}
}
}
}
#[derive(Debug, Clone)]
pub struct CompactDictionary {
str_to_id: HashMap<String, NodeId>,
id_to_term: Vec<RdfTerm>,
next_id: NodeId,
}
impl CompactDictionary {
pub fn new() -> Self {
Self {
str_to_id: HashMap::new(),
id_to_term: Vec::new(),
next_id: 0,
}
}
pub fn intern(&mut self, term: &RdfTerm) -> NodeId {
let key = term.canonical_key();
if let Some(&id) = self.str_to_id.get(&key) {
return id;
}
let id = self.next_id;
self.next_id = self
.next_id
.checked_add(1)
.expect("NodeId overflow (> 4 billion unique terms)");
self.str_to_id.insert(key, id);
self.id_to_term.push(term.clone());
id
}
pub fn lookup(&self, id: NodeId) -> Option<&RdfTerm> {
self.id_to_term.get(id as usize)
}
pub fn get_id(&self, term: &RdfTerm) -> Option<NodeId> {
let key = term.canonical_key();
self.str_to_id.get(&key).copied()
}
pub fn size(&self) -> usize {
self.id_to_term.len()
}
pub fn memory_estimate_bytes(&self) -> usize {
self.id_to_term
.iter()
.map(|t| {
let key_len = t.canonical_key().len();
let term_len = match t {
RdfTerm::Iri(s) => s.len() + 8,
RdfTerm::BlankNode(s) => s.len() + 8,
RdfTerm::PlainLiteral(s) => s.len() + 8,
RdfTerm::LangLiteral { value, lang } => value.len() + lang.len() + 16,
RdfTerm::TypedLiteral { value, datatype } => value.len() + datatype.len() + 16,
};
key_len + term_len + 16 })
.sum()
}
}
impl Default for CompactDictionary {
fn default() -> Self {
Self::new()
}
}
#[derive(Debug, Clone)]
pub struct CompactTripleStore {
dict: CompactDictionary,
spo: Vec<(NodeId, NodeId, NodeId)>,
pso: Vec<(NodeId, NodeId, NodeId)>,
dirty: bool,
}
impl CompactTripleStore {
pub fn new() -> Self {
Self {
dict: CompactDictionary::new(),
spo: Vec::new(),
pso: Vec::new(),
dirty: false,
}
}
pub fn insert(&mut self, s: &RdfTerm, p: &RdfTerm, o: &RdfTerm) {
let s_id = self.dict.intern(s);
let p_id = self.dict.intern(p);
let o_id = self.dict.intern(o);
let triple = (s_id, p_id, o_id);
self.ensure_sorted();
if self.spo.binary_search(&triple).is_ok() {
return; }
self.spo.push(triple);
self.pso.push((p_id, s_id, o_id));
self.dirty = true;
}
pub fn delete(&mut self, s: &RdfTerm, p: &RdfTerm, o: &RdfTerm) -> bool {
let s_id = match self.dict.get_id(s) {
Some(id) => id,
None => return false,
};
let p_id = match self.dict.get_id(p) {
Some(id) => id,
None => return false,
};
let o_id = match self.dict.get_id(o) {
Some(id) => id,
None => return false,
};
self.ensure_sorted();
let triple_spo = (s_id, p_id, o_id);
match self.spo.binary_search(&triple_spo) {
Ok(pos) => {
self.spo.remove(pos);
let triple_pso = (p_id, s_id, o_id);
if let Ok(ppos) = self.pso.binary_search(&triple_pso) {
self.pso.remove(ppos);
}
true
}
Err(_) => false,
}
}
pub fn contains(&self, s: &RdfTerm, p: &RdfTerm, o: &RdfTerm) -> bool {
let s_id = match self.dict.get_id(s) {
Some(id) => id,
None => return false,
};
let p_id = match self.dict.get_id(p) {
Some(id) => id,
None => return false,
};
let o_id = match self.dict.get_id(o) {
Some(id) => id,
None => return false,
};
if self.dirty {
return self.spo.contains(&(s_id, p_id, o_id));
}
self.spo.binary_search(&(s_id, p_id, o_id)).is_ok()
}
pub fn triple_count(&self) -> usize {
self.spo.len()
}
pub fn term_count(&self) -> usize {
self.dict.size()
}
pub fn find_by_subject(&mut self, s: &RdfTerm) -> Vec<(RdfTerm, RdfTerm, RdfTerm)> {
let s_id = match self.dict.get_id(s) {
Some(id) => id,
None => return vec![],
};
self.ensure_sorted();
let first = match self.spo.binary_search_by(|&(si, _, _)| si.cmp(&s_id)) {
Ok(pos) => {
let mut start = pos;
while start > 0 && self.spo[start - 1].0 == s_id {
start -= 1;
}
start
}
Err(_) => return vec![],
};
let mut result = Vec::new();
for &(si, pi, oi) in &self.spo[first..] {
if si != s_id {
break;
}
if let (Some(subj), Some(pred), Some(obj)) = (
self.dict.lookup(si),
self.dict.lookup(pi),
self.dict.lookup(oi),
) {
result.push((subj.clone(), pred.clone(), obj.clone()));
}
}
result
}
pub fn find_by_predicate(&mut self, p: &RdfTerm) -> Vec<(RdfTerm, RdfTerm, RdfTerm)> {
let p_id = match self.dict.get_id(p) {
Some(id) => id,
None => return vec![],
};
self.ensure_sorted();
let first = match self.pso.binary_search_by(|&(pi, _, _)| pi.cmp(&p_id)) {
Ok(pos) => {
let mut start = pos;
while start > 0 && self.pso[start - 1].0 == p_id {
start -= 1;
}
start
}
Err(_) => return vec![],
};
let mut result = Vec::new();
for &(pi, si, oi) in &self.pso[first..] {
if pi != p_id {
break;
}
if let (Some(subj), Some(pred), Some(obj)) = (
self.dict.lookup(si),
self.dict.lookup(pi),
self.dict.lookup(oi),
) {
result.push((subj.clone(), pred.clone(), obj.clone()));
}
}
result
}
pub fn find_by_predicate_object(&mut self, p: &RdfTerm, o: &RdfTerm) -> Vec<RdfTerm> {
let p_id = match self.dict.get_id(p) {
Some(id) => id,
None => return vec![],
};
let o_id = match self.dict.get_id(o) {
Some(id) => id,
None => return vec![],
};
self.ensure_sorted();
let first = match self.pso.binary_search_by(|&(pi, _, _)| pi.cmp(&p_id)) {
Ok(pos) => {
let mut start = pos;
while start > 0 && self.pso[start - 1].0 == p_id {
start -= 1;
}
start
}
Err(_) => return vec![],
};
let mut result = Vec::new();
for &(pi, si, oi) in &self.pso[first..] {
if pi != p_id {
break;
}
if oi == o_id {
if let Some(subj) = self.dict.lookup(si) {
result.push(subj.clone());
}
}
}
result
}
pub fn iter_all(&mut self) -> impl Iterator<Item = (RdfTerm, RdfTerm, RdfTerm)> + '_ {
self.ensure_sorted();
let dict = &self.dict;
self.spo.iter().filter_map(move |&(si, pi, oi)| {
let s = dict.lookup(si)?.clone();
let p = dict.lookup(pi)?.clone();
let o = dict.lookup(oi)?.clone();
Some((s, p, o))
})
}
pub fn memory_estimate_bytes(&self) -> usize {
let triple_bytes = (self.spo.len() + self.pso.len()) * 12; let dict_bytes = self.dict.memory_estimate_bytes();
triple_bytes + dict_bytes
}
pub fn dictionary(&self) -> &CompactDictionary {
&self.dict
}
fn ensure_sorted(&mut self) {
if self.dirty {
self.spo.sort_unstable();
self.pso.sort_unstable();
self.dirty = false;
}
}
pub fn bulk_insert<I>(&mut self, triples: I)
where
I: IntoIterator<Item = (RdfTerm, RdfTerm, RdfTerm)>,
{
for (s, p, o) in triples {
let s_id = self.dict.intern(&s);
let p_id = self.dict.intern(&p);
let o_id = self.dict.intern(&o);
self.spo.push((s_id, p_id, o_id));
self.pso.push((p_id, s_id, o_id));
}
self.dirty = true;
self.ensure_sorted();
self.spo.dedup();
self.pso.dedup();
}
}
impl Default for CompactTripleStore {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
fn ex(local: &str) -> RdfTerm {
RdfTerm::iri(format!("http://example.org/{}", local))
}
fn lit(v: &str) -> RdfTerm {
RdfTerm::literal(v)
}
#[test]
fn test_insert_and_count() {
let mut store = CompactTripleStore::new();
store.insert(&ex("alice"), &ex("knows"), &ex("bob"));
store.insert(&ex("alice"), &ex("name"), &lit("Alice"));
assert_eq!(store.triple_count(), 2);
}
#[test]
fn test_insert_duplicate() {
let mut store = CompactTripleStore::new();
store.insert(&ex("s"), &ex("p"), &ex("o"));
store.insert(&ex("s"), &ex("p"), &ex("o")); assert_eq!(store.triple_count(), 1);
}
#[test]
fn test_contains() {
let mut store = CompactTripleStore::new();
store.insert(&ex("s"), &ex("p"), &ex("o"));
assert!(store.contains(&ex("s"), &ex("p"), &ex("o")));
assert!(!store.contains(&ex("s"), &ex("p"), &ex("other")));
}
#[test]
fn test_delete() {
let mut store = CompactTripleStore::new();
store.insert(&ex("s"), &ex("p"), &ex("o"));
assert_eq!(store.triple_count(), 1);
let deleted = store.delete(&ex("s"), &ex("p"), &ex("o"));
assert!(deleted);
assert_eq!(store.triple_count(), 0);
let again = store.delete(&ex("s"), &ex("p"), &ex("o"));
assert!(!again);
}
#[test]
fn test_find_by_subject() {
let mut store = CompactTripleStore::new();
store.insert(&ex("alice"), &ex("knows"), &ex("bob"));
store.insert(&ex("alice"), &ex("name"), &lit("Alice"));
store.insert(&ex("bob"), &ex("name"), &lit("Bob"));
let results = store.find_by_subject(&ex("alice"));
assert_eq!(results.len(), 2);
}
#[test]
fn test_find_by_predicate() {
let mut store = CompactTripleStore::new();
store.insert(&ex("alice"), &ex("knows"), &ex("bob"));
store.insert(&ex("alice"), &ex("knows"), &ex("carol"));
store.insert(&ex("alice"), &ex("name"), &lit("Alice"));
let results = store.find_by_predicate(&ex("knows"));
assert_eq!(results.len(), 2);
}
#[test]
fn test_find_by_predicate_object() {
let mut store = CompactTripleStore::new();
store.insert(&ex("alice"), &ex("knows"), &ex("bob"));
store.insert(&ex("carol"), &ex("knows"), &ex("bob"));
store.insert(&ex("alice"), &ex("knows"), &ex("dave"));
let subjects = store.find_by_predicate_object(&ex("knows"), &ex("bob"));
assert_eq!(subjects.len(), 2);
}
#[test]
fn test_bulk_insert() {
let mut store = CompactTripleStore::new();
let triples: Vec<(RdfTerm, RdfTerm, RdfTerm)> = (0..100)
.map(|i| (ex(&format!("s{}", i)), ex("p"), ex(&format!("o{}", i))))
.collect();
store.bulk_insert(triples);
assert_eq!(store.triple_count(), 100);
}
#[test]
fn test_memory_estimate() {
let mut store = CompactTripleStore::new();
store.insert(&ex("s"), &ex("p"), &ex("o"));
let mem = store.memory_estimate_bytes();
assert!(mem > 0);
}
#[test]
fn test_dictionary_intern() {
let mut dict = CompactDictionary::new();
let term = RdfTerm::iri("http://example.org/");
let id1 = dict.intern(&term);
let id2 = dict.intern(&term); assert_eq!(id1, id2);
assert_eq!(dict.size(), 1);
}
#[test]
fn test_dictionary_lookup() {
let mut dict = CompactDictionary::new();
let term = RdfTerm::LangLiteral {
value: "hello".to_string(),
lang: "en".to_string(),
};
let id = dict.intern(&term);
let retrieved = dict.lookup(id).expect("term should be found");
assert_eq!(retrieved, &term);
}
#[test]
fn test_rdf_term_kinds() {
let iri = RdfTerm::iri("http://example.org/");
assert!(iri.is_iri());
assert!(!iri.is_blank_node());
assert!(!iri.is_literal());
let bnode = RdfTerm::blank("b0");
assert!(bnode.is_blank_node());
let plain_lit = RdfTerm::literal("hello");
assert!(plain_lit.is_literal());
let lang_lit = RdfTerm::lang_literal("hello", "en");
assert!(lang_lit.is_literal());
assert_eq!(lang_lit.lang(), Some("en"));
let typed_lit = RdfTerm::typed_literal("42", "http://www.w3.org/2001/XMLSchema#integer");
assert!(typed_lit.is_literal());
assert_eq!(
typed_lit.datatype(),
Some("http://www.w3.org/2001/XMLSchema#integer")
);
}
#[test]
fn test_rdf_term_display() {
let iri = RdfTerm::iri("http://example.org/");
assert_eq!(format!("{}", iri), "<http://example.org/>");
let bnode = RdfTerm::blank("b0");
assert_eq!(format!("{}", bnode), "_:b0");
let plain_lit = RdfTerm::literal("hello");
assert_eq!(format!("{}", plain_lit), "\"hello\"");
}
}