1use crate::model::{BlankNode, Literal, NamedNode, Object, Predicate, Subject};
7use anyhow::{Context, Result};
8use bimap::BiMap;
9use serde::{Deserialize, Serialize};
10use std::fs::{File, OpenOptions};
11use std::io::{BufReader, BufWriter, Write};
12use std::path::Path;
13use std::sync::{Arc, RwLock};
14
/// Interning tables that map subject, predicate and object terms to compact
/// `u32` identifiers and back.
///
/// Each of the three term positions has its own bidirectional map and its own
/// ID counter, so the same numeric ID can occur in more than one table. Every
/// field is wrapped in its own `Arc<RwLock<_>>`.
#[derive(Debug)]
pub struct TermInterner {
    /// ID <-> subject term table.
    subjects: Arc<RwLock<BiMap<u32, SubjectTerm>>>,
    /// ID <-> predicate IRI table (predicates are stored as plain strings).
    predicates: Arc<RwLock<BiMap<u32, String>>>,
    /// ID <-> object term table.
    objects: Arc<RwLock<BiMap<u32, ObjectTerm>>>,
    /// ID that will be handed to the next newly interned subject.
    next_subject_id: Arc<RwLock<u32>>,
    /// ID that will be handed to the next newly interned predicate.
    next_predicate_id: Arc<RwLock<u32>>,
    /// ID that will be handed to the next newly interned object.
    next_object_id: Arc<RwLock<u32>>,
    /// Running counters updated by the `intern_*` methods.
    stats: Arc<RwLock<InternerStats>>,
}
31
/// Owned, serializable form of a subject as it is kept in the subject table.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum SubjectTerm {
    /// A named node, stored as its IRI string.
    NamedNode(String),
    /// A blank node, stored as its identifier string.
    BlankNode(String),
}
38
/// Owned, serializable form of an object as it is kept in the object table.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum ObjectTerm {
    /// A named node, stored as its IRI string.
    NamedNode(String),
    /// A blank node, stored as its identifier string.
    BlankNode(String),
    /// A literal, stored as its lexical value plus optional annotations.
    Literal {
        /// Lexical form of the literal.
        value: String,
        /// Datatype IRI, if one was recorded.
        datatype: Option<String>,
        /// Language tag, if the literal carries one.
        language: Option<String>,
    },
}
50
/// Counters describing the contents and usage of a [`TermInterner`].
#[derive(Debug, Clone, Default)]
pub struct InternerStats {
    /// Number of distinct subjects interned.
    pub subject_count: usize,
    /// Number of distinct predicates interned.
    pub predicate_count: usize,
    /// Number of distinct objects interned.
    pub object_count: usize,
    /// Number of `intern_*` calls recorded (hits and misses).
    pub total_lookups: usize,
    /// Number of `intern_*` calls that found the term already present.
    pub cache_hits: usize,
    /// Running sum of the per-term memory estimates added on each insert.
    pub memory_bytes: usize,
}
61
62impl InternerStats {
63 pub fn hit_ratio(&self) -> f64 {
65 if self.total_lookups == 0 {
66 0.0
67 } else {
68 self.cache_hits as f64 / self.total_lookups as f64
69 }
70 }
71
72 pub fn total_terms(&self) -> usize {
74 self.subject_count + self.predicate_count + self.object_count
75 }
76}
77
78impl TermInterner {
79 pub fn new() -> Self {
81 TermInterner {
82 subjects: Arc::new(RwLock::new(BiMap::new())),
83 predicates: Arc::new(RwLock::new(BiMap::new())),
84 objects: Arc::new(RwLock::new(BiMap::new())),
85 next_subject_id: Arc::new(RwLock::new(0)),
86 next_predicate_id: Arc::new(RwLock::new(0)),
87 next_object_id: Arc::new(RwLock::new(0)),
88 stats: Arc::new(RwLock::new(InternerStats::default())),
89 }
90 }
91
92 pub fn intern_subject(&self, subject: &Subject) -> u32 {
94 let term = match subject {
95 Subject::NamedNode(n) => SubjectTerm::NamedNode(n.as_str().to_string()),
96 Subject::BlankNode(b) => SubjectTerm::BlankNode(b.id().to_string()),
97 Subject::Variable(_) | Subject::QuotedTriple(_) => {
98 panic!("Variables and quoted triples cannot be interned in storage")
99 }
100 };
101
102 {
104 let subjects = self.subjects.read().expect("subjects lock poisoned");
105 if let Some(&id) = subjects.get_by_right(&term) {
106 let mut stats = self.stats.write().expect("stats lock poisoned");
107 stats.total_lookups += 1;
108 stats.cache_hits += 1;
109 return id;
110 }
111 }
112
113 let mut subjects = self.subjects.write().expect("subjects lock poisoned");
115 if let Some(&id) = subjects.get_by_right(&term) {
117 let mut stats = self.stats.write().expect("stats lock poisoned");
118 stats.total_lookups += 1;
119 stats.cache_hits += 1;
120 return id;
121 }
122
123 let mut next_id = self
124 .next_subject_id
125 .write()
126 .expect("next_subject_id lock poisoned");
127 let id = *next_id;
128 *next_id += 1;
129 subjects.insert(id, term.clone());
130
131 let mut stats = self.stats.write().expect("stats lock poisoned");
133 stats.total_lookups += 1;
134 stats.subject_count += 1;
135 stats.memory_bytes += estimate_subject_memory(&term);
136
137 id
138 }
139
140 pub fn intern_predicate(&self, predicate: &Predicate) -> u32 {
142 let iri = match predicate {
143 Predicate::NamedNode(n) => n.as_str().to_string(),
144 Predicate::Variable(_) => {
145 panic!("Variables cannot be interned as predicates in storage")
146 }
147 };
148
149 {
151 let predicates = self.predicates.read().expect("predicates lock poisoned");
152 if let Some(&id) = predicates.get_by_right(&iri) {
153 let mut stats = self.stats.write().expect("stats lock poisoned");
154 stats.total_lookups += 1;
155 stats.cache_hits += 1;
156 return id;
157 }
158 }
159
160 let mut predicates = self.predicates.write().expect("predicates lock poisoned");
162 if let Some(&id) = predicates.get_by_right(&iri) {
164 let mut stats = self.stats.write().expect("stats lock poisoned");
165 stats.total_lookups += 1;
166 stats.cache_hits += 1;
167 return id;
168 }
169
170 let mut next_id = self
171 .next_predicate_id
172 .write()
173 .expect("next_predicate_id lock poisoned");
174 let id = *next_id;
175 *next_id += 1;
176 predicates.insert(id, iri.clone());
177
178 let mut stats = self.stats.write().expect("stats lock poisoned");
180 stats.total_lookups += 1;
181 stats.predicate_count += 1;
182 stats.memory_bytes += iri.len() + 4; id
185 }
186
187 pub fn intern_object(&self, object: &Object) -> u32 {
189 let term = match object {
190 Object::NamedNode(n) => ObjectTerm::NamedNode(n.as_str().to_string()),
191 Object::BlankNode(b) => ObjectTerm::BlankNode(b.id().to_string()),
192 Object::Literal(l) => ObjectTerm::Literal {
193 value: l.value().to_string(),
194 datatype: Some(l.datatype().as_str().to_string()),
195 language: l.language().map(|lang| lang.to_string()),
196 },
197 Object::Variable(_) | Object::QuotedTriple(_) => {
198 panic!("Variables and quoted triples cannot be interned in storage")
199 }
200 };
201
202 {
204 let objects = self.objects.read().expect("objects lock poisoned");
205 if let Some(&id) = objects.get_by_right(&term) {
206 let mut stats = self.stats.write().expect("stats lock poisoned");
207 stats.total_lookups += 1;
208 stats.cache_hits += 1;
209 return id;
210 }
211 }
212
213 let mut objects = self.objects.write().expect("objects lock poisoned");
215 if let Some(&id) = objects.get_by_right(&term) {
217 let mut stats = self.stats.write().expect("stats lock poisoned");
218 stats.total_lookups += 1;
219 stats.cache_hits += 1;
220 return id;
221 }
222
223 let mut next_id = self
224 .next_object_id
225 .write()
226 .expect("next_object_id lock poisoned");
227 let id = *next_id;
228 *next_id += 1;
229 objects.insert(id, term.clone());
230
231 let mut stats = self.stats.write().expect("stats lock poisoned");
233 stats.total_lookups += 1;
234 stats.object_count += 1;
235 stats.memory_bytes += estimate_object_memory(&term);
236
237 id
238 }
239
240 pub fn get_subject(&self, id: u32) -> Option<Subject> {
242 let subjects = self.subjects.read().expect("subjects lock poisoned");
243 subjects.get_by_left(&id).map(|term| match term {
244 SubjectTerm::NamedNode(iri) => {
245 Subject::NamedNode(NamedNode::new(iri).expect("interned IRI should be valid"))
246 }
247 SubjectTerm::BlankNode(blank_id) => Subject::BlankNode(
248 BlankNode::new(blank_id).expect("interned blank node ID should be valid"),
249 ),
250 })
251 }
252
253 pub fn get_subject_id(&self, subject: &Subject) -> Option<u32> {
255 let term = match subject {
256 Subject::NamedNode(n) => SubjectTerm::NamedNode(n.as_str().to_string()),
257 Subject::BlankNode(b) => SubjectTerm::BlankNode(b.id().to_string()),
258 Subject::Variable(_) | Subject::QuotedTriple(_) => {
259 panic!("Variables and quoted triples cannot be interned in storage")
260 }
261 };
262 let subjects = self.subjects.read().expect("subjects lock poisoned");
263 subjects.get_by_right(&term).copied()
264 }
265
266 pub fn get_predicate(&self, id: u32) -> Option<Predicate> {
268 let predicates = self.predicates.read().expect("predicates lock poisoned");
269 predicates.get_by_left(&id).map(|iri| {
270 Predicate::NamedNode(NamedNode::new(iri).expect("interned IRI should be valid"))
271 })
272 }
273
274 pub fn get_predicate_id(&self, predicate: &Predicate) -> Option<u32> {
276 let iri = match predicate {
277 Predicate::NamedNode(n) => n.as_str().to_string(),
278 Predicate::Variable(_) => {
279 panic!("Variables cannot be interned as predicates in storage")
280 }
281 };
282 let predicates = self.predicates.read().expect("predicates lock poisoned");
283 predicates.get_by_right(&iri).copied()
284 }
285
286 pub fn get_object(&self, id: u32) -> Option<Object> {
288 let objects = self.objects.read().expect("objects lock poisoned");
289 objects.get_by_left(&id).map(|term| match term {
290 ObjectTerm::NamedNode(iri) => {
291 Object::NamedNode(NamedNode::new(iri).expect("interned IRI should be valid"))
292 }
293 ObjectTerm::BlankNode(id) => Object::BlankNode(
294 BlankNode::new(id).expect("interned blank node ID should be valid"),
295 ),
296 ObjectTerm::Literal {
297 value,
298 datatype,
299 language,
300 } => {
301 let literal = if let Some(lang) = language {
302 Literal::new_language_tagged_literal(value, lang)
303 .expect("interned language tag should be valid")
304 } else if let Some(dt) = datatype {
305 Literal::new_typed(
306 value,
307 NamedNode::new(dt).expect("interned datatype IRI should be valid"),
308 )
309 } else {
310 Literal::new(value)
311 };
312 Object::Literal(literal)
313 }
314 })
315 }
316
317 pub fn get_object_id(&self, object: &Object) -> Option<u32> {
319 let term = match object {
320 Object::NamedNode(n) => ObjectTerm::NamedNode(n.as_str().to_string()),
321 Object::BlankNode(b) => ObjectTerm::BlankNode(b.id().to_string()),
322 Object::Literal(l) => ObjectTerm::Literal {
323 value: l.value().to_string(),
324 datatype: Some(l.datatype().as_str().to_string()),
325 language: l.language().map(|lang| lang.to_string()),
326 },
327 Object::Variable(_) | Object::QuotedTriple(_) => {
328 panic!("Variables and quoted triples cannot be interned in storage")
329 }
330 };
331 let objects = self.objects.read().expect("objects lock poisoned");
332 objects.get_by_right(&term).copied()
333 }
334
335 pub fn stats(&self) -> InternerStats {
337 self.stats.read().expect("stats lock poisoned").clone()
338 }
339
340 pub fn clear(&self) {
342 let mut subjects = self.subjects.write().expect("subjects lock poisoned");
343 let mut predicates = self.predicates.write().expect("predicates lock poisoned");
344 let mut objects = self.objects.write().expect("objects lock poisoned");
345
346 subjects.clear();
347 predicates.clear();
348 objects.clear();
349
350 *self
351 .next_subject_id
352 .write()
353 .expect("next_subject_id lock poisoned") = 0;
354 *self
355 .next_predicate_id
356 .write()
357 .expect("next_predicate_id lock poisoned") = 0;
358 *self
359 .next_object_id
360 .write()
361 .expect("next_object_id lock poisoned") = 0;
362
363 let mut stats = self.stats.write().expect("stats lock poisoned");
364 *stats = InternerStats::default();
365 }
366
367 pub fn memory_usage(&self) -> usize {
369 self.stats.read().expect("stats lock poisoned").memory_bytes
370 }
371
372 pub fn intern_named_node(&self, node: &NamedNode) -> u64 {
374 self.intern_subject(&Subject::NamedNode(node.clone())) as u64
375 }
376
377 pub fn intern_blank_node(&self, node: &BlankNode) -> u64 {
379 self.intern_subject(&Subject::BlankNode(node.clone())) as u64
380 }
381
382 pub fn intern_literal(&self, literal: &Literal) -> u64 {
384 self.intern_object(&Object::Literal(literal.clone())) as u64
385 }
386
387 pub fn get_named_node_id(&self, node: &NamedNode) -> Option<u64> {
389 self.get_subject_id(&Subject::NamedNode(node.clone()))
390 .map(|id| id as u64)
391 }
392
393 pub fn get_blank_node_id(&self, node: &BlankNode) -> Option<u64> {
395 self.get_subject_id(&Subject::BlankNode(node.clone()))
396 .map(|id| id as u64)
397 }
398
399 pub fn get_literal_id(&self, literal: &Literal) -> Option<u64> {
401 self.get_object_id(&Object::Literal(literal.clone()))
402 .map(|id| id as u64)
403 }
404
405 pub fn save<P: AsRef<Path>>(&self, path: P) -> Result<()> {
407 let file = OpenOptions::new()
408 .write(true)
409 .create(true)
410 .truncate(true)
411 .open(path)
412 .context("Failed to create terms file")?;
413
414 let mut writer = BufWriter::new(file);
415
416 {
418 let subjects = self.subjects.read().expect("subjects lock poisoned");
419 let subject_data: Vec<(u32, SubjectTerm)> = subjects
420 .iter()
421 .map(|(id, term)| (*id, term.clone()))
422 .collect();
423 oxicode::serde::encode_into_std_write(
424 &subject_data,
425 &mut writer,
426 oxicode::config::standard(),
427 )
428 .map_err(|e| anyhow::anyhow!("Failed to serialize subjects: {}", e))?;
429 }
430
431 {
433 let predicates = self.predicates.read().expect("predicates lock poisoned");
434 let predicate_data: Vec<(u32, String)> = predicates
435 .iter()
436 .map(|(id, iri)| (*id, iri.clone()))
437 .collect();
438 oxicode::serde::encode_into_std_write(
439 &predicate_data,
440 &mut writer,
441 oxicode::config::standard(),
442 )
443 .map_err(|e| anyhow::anyhow!("Failed to serialize predicates: {}", e))?;
444 }
445
446 {
448 let objects = self.objects.read().expect("objects lock poisoned");
449 let object_data: Vec<(u32, ObjectTerm)> = objects
450 .iter()
451 .map(|(id, term)| (*id, term.clone()))
452 .collect();
453 oxicode::serde::encode_into_std_write(
454 &object_data,
455 &mut writer,
456 oxicode::config::standard(),
457 )
458 .map_err(|e| anyhow::anyhow!("Failed to serialize objects: {}", e))?;
459 }
460
461 let next_subject_id = *self
463 .next_subject_id
464 .read()
465 .expect("next_subject_id lock poisoned");
466 let next_predicate_id = *self
467 .next_predicate_id
468 .read()
469 .expect("next_predicate_id lock poisoned");
470 let next_object_id = *self
471 .next_object_id
472 .read()
473 .expect("next_object_id lock poisoned");
474
475 oxicode::serde::encode_into_std_write(
476 &next_subject_id,
477 &mut writer,
478 oxicode::config::standard(),
479 )?;
480 oxicode::serde::encode_into_std_write(
481 &next_predicate_id,
482 &mut writer,
483 oxicode::config::standard(),
484 )?;
485 oxicode::serde::encode_into_std_write(
486 &next_object_id,
487 &mut writer,
488 oxicode::config::standard(),
489 )?;
490
491 writer.flush()?;
492 Ok(())
493 }
494
495 pub fn load<P: AsRef<Path>>(path: P) -> Result<Self> {
497 let file = File::open(path).context("Failed to open terms file")?;
498 let mut reader = BufReader::new(file);
499
500 let (subject_data, _): (Vec<(u32, SubjectTerm)>, _) =
502 oxicode::serde::decode_from_std_read(&mut reader, oxicode::config::standard())
503 .map_err(|e| anyhow::anyhow!("Failed to deserialize subjects: {}", e))?;
504 let mut subjects = BiMap::new();
505 for (id, term) in subject_data {
506 subjects.insert(id, term);
507 }
508
509 let (predicate_data, _): (Vec<(u32, String)>, _) =
511 oxicode::serde::decode_from_std_read(&mut reader, oxicode::config::standard())
512 .map_err(|e| anyhow::anyhow!("Failed to deserialize predicates: {}", e))?;
513 let mut predicates = BiMap::new();
514 for (id, iri) in predicate_data {
515 predicates.insert(id, iri);
516 }
517
518 let (object_data, _): (Vec<(u32, ObjectTerm)>, _) =
520 oxicode::serde::decode_from_std_read(&mut reader, oxicode::config::standard())
521 .map_err(|e| anyhow::anyhow!("Failed to deserialize objects: {}", e))?;
522 let mut objects = BiMap::new();
523 for (id, term) in object_data {
524 objects.insert(id, term);
525 }
526
527 let (next_subject_id, _): (u32, _) =
529 oxicode::serde::decode_from_std_read(&mut reader, oxicode::config::standard())?;
530 let (next_predicate_id, _): (u32, _) =
531 oxicode::serde::decode_from_std_read(&mut reader, oxicode::config::standard())?;
532 let (next_object_id, _): (u32, _) =
533 oxicode::serde::decode_from_std_read(&mut reader, oxicode::config::standard())?;
534
535 let stats = InternerStats {
537 subject_count: subjects.len(),
538 predicate_count: predicates.len(),
539 object_count: objects.len(),
540 ..Default::default()
541 };
542
543 Ok(TermInterner {
544 subjects: Arc::new(RwLock::new(subjects)),
545 predicates: Arc::new(RwLock::new(predicates)),
546 objects: Arc::new(RwLock::new(objects)),
547 next_subject_id: Arc::new(RwLock::new(next_subject_id)),
548 next_predicate_id: Arc::new(RwLock::new(next_predicate_id)),
549 next_object_id: Arc::new(RwLock::new(next_object_id)),
550 stats: Arc::new(RwLock::new(stats)),
551 })
552 }
553}
554
555impl Default for TermInterner {
556 fn default() -> Self {
557 Self::new()
558 }
559}
560
561fn estimate_subject_memory(term: &SubjectTerm) -> usize {
563 match term {
564 SubjectTerm::NamedNode(iri) => iri.len() + 4 + 8, SubjectTerm::BlankNode(id) => id.len() + 4 + 8,
566 }
567}
568
569fn estimate_object_memory(term: &ObjectTerm) -> usize {
571 match term {
572 ObjectTerm::NamedNode(iri) => iri.len() + 4 + 8,
573 ObjectTerm::BlankNode(id) => id.len() + 4 + 8,
574 ObjectTerm::Literal {
575 value,
576 datatype,
577 language,
578 } => {
579 value.len()
580 + datatype.as_ref().map_or(0, |s| s.len())
581 + language.as_ref().map_or(0, |s| s.len())
582 + 4
583 + 24 }
585 }
586}
587
/// Unit tests covering interning, reverse lookup, clearing and concurrent use.
#[cfg(test)]
mod tests {
    use super::*;

    /// Equal subjects share an ID, distinct subjects do not, and the
    /// statistics reflect one hit out of three lookups.
    #[test]
    fn test_subject_interning() {
        let interner = TermInterner::new();

        let subject1 =
            Subject::NamedNode(NamedNode::new("http://example.org/s1").expect("valid IRI"));
        let subject2 = Subject::BlankNode(BlankNode::new("b1").expect("valid blank node id"));
        let subject3 =
            Subject::NamedNode(NamedNode::new("http://example.org/s1").expect("valid IRI"));

        let id1 = interner.intern_subject(&subject1);
        let id2 = interner.intern_subject(&subject2);
        let id3 = interner.intern_subject(&subject3);

        assert_eq!(id1, id3);
        assert_ne!(id1, id2);

        assert_eq!(interner.get_subject(id1), Some(subject1.clone()));
        assert_eq!(interner.get_subject(id2), Some(subject2));

        let stats = interner.stats();
        assert_eq!(stats.subject_count, 2);
        assert_eq!(stats.cache_hits, 1);
        assert_eq!(stats.total_lookups, 3);
    }

    /// Equal predicates share an ID and round-trip through `get_predicate`.
    #[test]
    fn test_predicate_interning() {
        let interner = TermInterner::new();

        let pred1 =
            Predicate::NamedNode(NamedNode::new("http://example.org/p1").expect("valid IRI"));
        let pred2 =
            Predicate::NamedNode(NamedNode::new("http://example.org/p2").expect("valid IRI"));
        let pred3 =
            Predicate::NamedNode(NamedNode::new("http://example.org/p1").expect("valid IRI"));

        let id1 = interner.intern_predicate(&pred1);
        let id2 = interner.intern_predicate(&pred2);
        let id3 = interner.intern_predicate(&pred3);

        assert_eq!(id1, id3);
        assert_ne!(id1, id2);

        assert_eq!(interner.get_predicate(id1), Some(pred1));
        assert_eq!(interner.get_predicate(id2), Some(pred2));
    }

    /// Equal objects share an ID and round-trip through `get_object`.
    #[test]
    fn test_object_interning() {
        let interner = TermInterner::new();

        let obj1 = Object::Literal(Literal::new("test"));
        let obj2 = Object::NamedNode(NamedNode::new("http://example.org/o1").expect("valid IRI"));
        let obj3 = Object::Literal(Literal::new("test"));

        let id1 = interner.intern_object(&obj1);
        let id2 = interner.intern_object(&obj2);
        let id3 = interner.intern_object(&obj3);

        assert_eq!(id1, id3);
        assert_ne!(id1, id2);

        assert_eq!(interner.get_object(id1), Some(obj1));
        assert_eq!(interner.get_object(id2), Some(obj2));
    }

    /// `clear` removes interned terms and resets the term counts.
    #[test]
    fn test_clear() {
        let interner = TermInterner::new();

        let subject =
            Subject::NamedNode(NamedNode::new("http://example.org/s1").expect("valid IRI"));
        let id = interner.intern_subject(&subject);

        assert!(interner.get_subject(id).is_some());

        interner.clear();

        assert!(interner.get_subject(id).is_none());
        assert_eq!(interner.stats().total_terms(), 0);
    }

    /// Ten threads intern three distinct subjects between them; exactly three
    /// IDs must come back regardless of scheduling.
    #[test]
    fn test_concurrent_access() {
        use std::sync::Arc;
        use std::thread;

        let interner = Arc::new(TermInterner::new());
        let mut handles = vec![];

        for i in 0..10 {
            let interner_clone = Arc::clone(&interner);
            let handle = thread::spawn(move || {
                let subject = Subject::NamedNode(
                    NamedNode::new(format!("http://example.org/s{}", i % 3))
                        .expect("valid IRI from format"),
                );
                interner_clone.intern_subject(&subject)
            });
            handles.push(handle);
        }

        let ids: Vec<u32> = handles
            .into_iter()
            .map(|h| h.join().expect("thread should not panic"))
            .collect();

        // `i % 3` yields three distinct IRIs, so an upper bound alone would
        // also accept the broken case where every thread got the same ID.
        let unique_ids: std::collections::HashSet<_> = ids.iter().collect();
        assert_eq!(unique_ids.len(), 3);

        // Each call is counted once; all but the three inserts are hits.
        let stats = interner.stats();
        assert_eq!(stats.subject_count, 3);
        assert_eq!(stats.total_lookups, 10);
        assert_eq!(stats.cache_hits, 7);
    }
}